Mirror of https://github.com/ROCm/ROCm.git (synced 2026-01-10 23:28:03 -05:00)

Compare commits: docs/7.0.2 ... docs/6.4.3
46 Commits
Commits in this comparison:

da0131029b  6dfa2fbe15  ebf75a901b  3666bada88  cecbb9a408  b2ed169073
21b9596585  94ea06eb01  2ce25bc262  d90d040084  f429d3cc8a  b3211cc6fa
5853468fca  245c95690f  39c1b926f6  3c3847f9f7  249bd177ec  b2ee8d4b2e
3f834cf520  70ba866c5b  320ec4669a  c9bd93b537  a060550bcd  c92cbaee66
c84afacc8d  843fd1b3fb  82221c4e2d  d0ebe126e7  74610893a9  afe3e21cad
ae2440772f  61f970a24d  85a1682573  87c6e320b4  b50948fe6b  91407405a9
8f23f63a6b  11747aaadc  1088beefe5  b7988925a5  89dafa6232  8054852dad
542d7813ce  bc1ffe4fcb  09997c68bb  42bc3501ac
@@ -5,6 +5,7 @@ ACEs
 ACS
 AccVGPR
 AccVGPRs
+AITER
 ALU
 AllReduce
 AMD
@@ -115,6 +116,7 @@ Deprecations
 DevCap
 DirectX
 Dockerfile
+Dockerized
 Doxygen
 dropless
 ELMo
@@ -122,6 +124,7 @@ ENDPGM
 EPYC
 ESXi
 EoS
+fas
 FBGEMM
 FFT
 FFTs
@@ -153,6 +156,7 @@ GEMMs
 GFLOPS
 GFortran
 GFXIP
+GGUF
 Gemma
 GiB
 GIM
@@ -194,6 +198,7 @@ HWE
 HWS
 Haswell
 Higgs
+href
 Hyperparameters
 Huggingface
 ICD
@@ -289,6 +294,7 @@ Multicore
 Multithreaded
 MyEnvironment
 MyST
+NANOO
 NBIO
 NBIOs
 NCCL
@@ -360,6 +366,7 @@ PowerEdge
 PowerShell
 Pretrained
 Pretraining
+Primus
 Profiler's
 PyPi
 Pytest
@@ -458,8 +465,6 @@ TPS
 TPU
 TPUs
 TSME
-Taichi
-Taichi's
 Tagram
 TensileLite
 TensorBoard
@@ -495,6 +500,7 @@ Unhandled
 VALU
 VBIOS
 VCN
+verl's
 VGPR
 VGPRs
 VM
@@ -524,6 +530,7 @@ Xilinx
 Xnack
 Xteam
 YAML
+YAMLs
 YML
 YModel
 ZeRO
@@ -584,6 +591,7 @@ completers
 composable
 concretization
 config
+configs
 conformant
 constructible
 convolutional
@@ -663,6 +671,7 @@ github
 globals
 gnupg
 grayscale
+gx
 gzip
 heterogenous
 hipBLAS
@@ -735,6 +744,7 @@ logits
 lossy
 macOS
 matchers
+maxtext
 megatron
 microarchitecture
 migraphx
@@ -772,6 +782,7 @@ parallelizing
 param
 parameterization
 passthrough
+pe
 perfcounter
 performant
 perl
@@ -794,11 +805,14 @@ preprocessing
 preprocessor
 prequantized
 prerequisites
+pretrain
 pretraining
+primus
 profiler
 profilers
 protobuf
 pseudorandom
+px
 py
 pytorch
 recommender
@@ -909,6 +923,7 @@ toolchain
 toolchains
 toolset
 toolsets
+torchtitan
 torchvision
 tqdm
 tracebacks
@@ -57,9 +57,8 @@ ROCm documentation continues to be updated to provide clearer and more comprehen
 For more information about the changes, see [Changelog for the AI Developer Hub](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/changelog.html).

-* ROCm provides a comprehensive ecosystem for deep learning development. For more details, see [Deep learning frameworks for ROCm](https://rocm.docs.amd.com/en/docs-6.4.3/how-to/deep-learning-rocm.html). AMD ROCm adds support for the following deep learning frameworks:
+* ROCm provides a comprehensive ecosystem for deep learning development. For more details, see [Deep learning frameworks for ROCm](https://rocm.docs.amd.com/en/docs-6.4.3/how-to/deep-learning-rocm.html). AMD ROCm adds support for the following deep learning framework:
-* Taichi is an open-source, imperative, and parallel programming language designed for high-performance numerical computation. Embedded in Python, it leverages just-in-time (JIT) compilation frameworks such as LLVM to accelerate compute-intensive Python code by compiling it to native GPU or CPU instructions. It is currently supported on ROCm 6.3.2. For more information, see [Taichi compatibility](https://rocm.docs.amd.com/en/docs-6.4.3/compatibility/ml-compatibility/taichi-compatibility.html).
 * Megablocks is a light-weight library for mixture-of-experts (MoE) training. The core of the system is efficient "dropless-MoE" and standard MoE layers. Megablocks is integrated with Megatron-LM, where data and pipeline parallel training of MoEs is supported. It is currently supported on ROCm 6.3.0. For more information, see [Megablocks compatibility](https://rocm.docs.amd.com/en/docs-6.4.3/compatibility/ml-compatibility/megablocks-compatibility.html).

 * The [Data types and precision support](https://rocm.docs.amd.com/en/latest/reference/precision-support.html) topic now includes new hardware and library support information.
@@ -50,7 +50,7 @@ additional licenses. Please review individual repositories for more information.
 | [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
 | [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
 | [MIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
-| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/MIOpen/blob/develop/LICENSE.txt) |
+| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/miopen/LICENSE.md) |
 | [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
 | [rocAL](https://github.com/ROCm/rocAL) | [MIT](https://github.com/ROCm/rocAL/blob/develop/LICENSE.txt) |
 | [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
@@ -67,15 +67,15 @@ additional licenses. Please review individual repositories for more information.
 | [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
 | [ROCm-Core](https://github.com/ROCm/rocm-core) | [MIT](https://github.com/ROCm/rocm-core/blob/master/copyright) |
 | [ROCm Compute Profiler](https://github.com/ROCm/rocprofiler-compute) | [MIT](https://github.com/ROCm/rocprofiler-compute/blob/amd-staging/LICENSE) |
-| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/amd-staging/LICENSE) |
+| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/amd-staging/LICENSE.md) |
 | [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
 | [ROCm-OpenCL-Runtime](https://github.com/ROCm/clr/tree/amd-staging/opencl) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/opencl/LICENSE.txt) |
 | [ROCm Performance Primitives (RPP)](https://github.com/ROCm/rpp) | [MIT](https://github.com/ROCm/rpp/blob/develop/LICENSE) |
-| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/amd-staging/License.txt) |
+| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/amd-staging/LICENSE.md) |
-| [ROCm Systems Profiler](https://github.com/ROCm/rocprofiler-systems) | [MIT](https://github.com/ROCm/rocprofiler-systems/blob/amd-staging/LICENSE) |
+| [ROCm Systems Profiler](https://github.com/ROCm/rocprofiler-systems) | [MIT](https://github.com/ROCm/rocprofiler-systems/blob/amd-staging/LICENSE.md) |
 | [ROCm Validation Suite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
 | [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
-| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-staging/LICENSE) |
+| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-staging/LICENSE.md) |
 | [ROCprofiler-SDK](https://github.com/ROCm/rocprofiler-sdk) | [MIT](https://github.com/ROCm/rocprofiler-sdk/blob/amd-mainline/LICENSE) |
 | [rocPyDecode](https://github.com/ROCm/rocPyDecode) | [MIT](https://github.com/ROCm/rocPyDecode/blob/develop/LICENSE.txt) |
 | [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
@@ -31,10 +31,11 @@ ROCm Version,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6
 :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
 :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
 :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A
-:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,N/A,N/A,85f95ae,85f95ae,85f95ae,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
-:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`,N/A,N/A,N/A,N/A,0.7.0,0.7.0,0.7.0,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-:doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat]_,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,N/A,N/A,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
 ,,,,,,,,,,,,,,,,,,
 ,,,,,,,,,,,,,,,,,,
@@ -242,8 +242,11 @@ Expand for full historical view of:
 .. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
 .. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
 .. [#verl_compat] verl is only supported on ROCm 6.2.0.
+.. [#stanford-megatron-lm_compat] Stanford Megatron-LM is only supported on ROCm 6.3.0.
 .. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
-.. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
+.. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
+.. [#ray_compat] Ray is only supported on ROCm 6.4.1.
+.. [#llama-cpp_compat] llama.cpp is only supported on ROCm 6.4.0.
 .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
 .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst (new file, 156 lines)
@@ -0,0 +1,156 @@

:orphan:

.. meta::
   :description: llama.cpp deep learning framework compatibility
   :keywords: GPU, GGML, llama.cpp compatibility

.. version-set:: rocm_version latest

********************************************************************************
llama.cpp compatibility
********************************************************************************

`llama.cpp <https://github.com/ggml-org/llama.cpp>`__ is an open-source framework
for Large Language Model (LLM) inference that runs on both central processing units
(CPUs) and graphics processing units (GPUs). It is written in plain C/C++, providing
a simple, dependency-free setup.

The framework supports multiple quantization options, from 1.5-bit to 8-bit integers,
to speed up inference and reduce memory usage. Originally built as a CPU-first library,
llama.cpp is easy to integrate with other programming environments and is widely
adopted across diverse platforms, including consumer devices.

ROCm support for llama.cpp is upstreamed, and you can build the official source code
with ROCm support:

- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp
  <https://github.com/ROCm/llama.cpp>`_ repository.

- Due to independent compatibility considerations, this location differs from the
  `https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.

- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`,
  which includes ROCm, llama.cpp, and all required dependencies.

- See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
  to install and get started.

- See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__
  in the upstream llama.cpp documentation.

.. note::

   llama.cpp is supported on ROCm 6.4.0.

Supported devices
================================================================================

**Officially Supported**: AMD Instinct™ MI300X, MI210

Use cases and recommendations
================================================================================

llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:

- Plain C/C++ implementation with no external dependencies
- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)

llama.cpp is also used in a range of real-world applications, including:

- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
  A simple maze game where AI-controlled agents attempt to trick the player.
- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
  A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
- Various other AI applications use llama.cpp as their inference engine;
  for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.

For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.

- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__
  blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``,
  server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for
  AMD Instinct GPUs within the ROCm ecosystem.

.. _llama-cpp-docker-compat:

Docker image compatibility
================================================================================

.. |docker-icon| raw:: html

   <i class="fab fa-docker"></i>

AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp>`__
with ROCm backends on Docker Hub. The following Docker image tags and associated
inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
Click |docker-icon| to view the image on Docker Hub.

.. important::

   Tag endings of ``_full``, ``_server``, and ``_light`` serve different purposes for entrypoints as follows:

   - Full: This image includes both the main executable file and the tools to convert ``LLaMA`` models into ``ggml`` and convert into 4-bit quantization.
   - Server: This image only includes the server executable file.
   - Light: This image only includes the main executable file.

.. list-table::
   :header-rows: 1
   :class: docker-image-compatibility

   * - Full Docker
     - Server Docker
     - Light Docker
     - llama.cpp
     - Ubuntu
   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_full/images/sha256-f78f6c81ab2f8e957469415fe2370a1334fe969c381d1fe46050c85effaee9d5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>

     - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_server/images/sha256-275ad9e18f292c26a00a2de840c37917e98737a88a3520bdc35fd3fc5c9a6a9b"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>

     - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_light/images/sha256-cc324e6faeedf0e400011f07b49d2dc41a16bae257b2b7befa0f4e2e97231320"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>

     - `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
     - 24.04

Key ROCm libraries for llama.cpp
================================================================================

llama.cpp functionality on ROCm is determined by its underlying library
dependencies. These ROCm components affect the capabilities, performance, and
feature set available to developers.

.. list-table::
   :header-rows: 1

   * - ROCm library
     - Version
     - Purpose
     - Usage
   * - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
     - :version-ref:`hipBLAS rocm_version`
     - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
       matrix and vector operations.
     - Supports operations such as matrix multiplication, matrix-vector
       products, and tensor contractions. Utilized in both dense and batched
       linear algebra operations.
   * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
     - :version-ref:`hipBLASLt rocm_version`
     - hipBLASLt is an extension of the hipBLAS library, providing additional
       features like epilogues fused into the matrix multiplication kernel or
       use of integer tensor cores.
     - By setting the flag ``ROCBLAS_USE_HIPBLASLT``, you can dispatch hipBLASLt
       kernels where possible.
   * - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
     - :version-ref:`rocWMMA rocm_version`
     - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
       multiplication (GEMM) and accumulation operations with mixed precision
       support.
     - Can be used to enhance the flash attention performance on AMD compute, by enabling
       the flag during compile time.
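As a practical aside, the library table above notes that setting ``ROCBLAS_USE_HIPBLASLT`` dispatches hipBLASLt kernels where possible. A minimal sketch of launching the light Docker image from the compatibility table with that flag set follows; the image tag comes from the table, while the model path, prompt, and the assumption that a ``llama-cli`` binary is on the container's PATH are illustrative only.

    import os
    import subprocess

    # Hypothetical launcher: run llama-cli from the prebuilt "light" image with
    # hipBLASLt dispatch enabled. Paths and prompt are placeholders.
    cmd = [
        "docker", "run", "--rm",
        "--device=/dev/kfd", "--device=/dev/dri",   # expose AMD GPUs to the container
        "-e", "ROCBLAS_USE_HIPBLASLT=1",            # flag described in the table above
        "-v", os.path.expanduser("~/models") + ":/models",
        "rocm/llama.cpp:llama.cpp-b5997_rocm6.4.0_ubuntu24.04_light",
        "llama-cli", "-m", "/models/model.gguf", "-ngl", "99", "-p", "Hello",
    ]
    subprocess.run(cmd, check=True)
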
docs/compatibility/ml-compatibility/ray-compatibility.rst (new file, 111 lines)
@@ -0,0 +1,111 @@

:orphan:

.. meta::
   :description: Ray deep learning framework compatibility
   :keywords: GPU, Ray compatibility

.. version-set:: rocm_version latest

*******************************************************************************
Ray compatibility
*******************************************************************************

Ray is a unified framework for scaling AI and Python applications from your laptop
to a full cluster, without changing your code. Ray consists of `a core distributed
runtime <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`_ and a set of
`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`_ for
simplifying machine learning computations.

Ray is a general-purpose framework that runs many types of workloads efficiently.
Any Python application can be scaled with Ray, without extra infrastructure.

ROCm support for Ray is upstreamed, and you can build the official source code
with ROCm support:

- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray
  <https://github.com/ROCm/ray>`_ repository.

- Due to independent compatibility considerations, this location differs from the
  `https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.

- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>`
  which includes ROCm, Ray, and all required dependencies.

- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
  for instructions to get started.

- See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_
  in the upstream Ray documentation.

- The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__
  corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.

.. note::

   Ray is supported on ROCm 6.4.1.

Supported devices
================================================================================

**Officially Supported**: AMD Instinct™ MI300X, MI210

Use cases and recommendations
================================================================================

* The `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm
  Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__
  blog provides an overview of Volcano Engine Reinforcement Learning (verl)
  for large language models (LLMs) and discusses its benefits in large-scale
  reinforcement learning from human feedback (RLHF). It uses Ray as part of a
  hybrid orchestration engine to schedule and coordinate training and inference
  tasks in parallel, enabling optimized resource utilization and potential overlap
  between these phases. This dynamic resource allocation strategy significantly
  improves overall system efficiency. The blog presents verl’s performance results,
  focusing on throughput and convergence accuracy achieved on AMD Instinct™ MI300X
  GPUs. Follow this guide to get started with verl on AMD Instinct GPUs and
  accelerate your RLHF training with ROCm-optimized performance.

* The `Exploring Use Cases for Scalable AI: Implementing Ray with ROCm Support for Efficient ML Workflows
  <https://rocm.blogs.amd.com/artificial-intelligence/rocm-ray/README.html>`__
  blog post describes key use cases such as training and inference for large language models (LLMs),
  model serving, hyperparameter tuning, reinforcement learning, and the orchestration of large-scale
  workloads using Ray in the ROCm environment.

For more use cases and recommendations, see the AMD GPU tabs in the `Accelerator Support
topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accelerator-support>`__
of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
where you can search for Ray examples and best practices to optimize your workloads on AMD GPUs.

.. _ray-docker-compat:

Docker image compatibility
================================================================================

.. |docker-icon| raw:: html

   <i class="fab fa-docker"></i>

AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
with ROCm backends on Docker Hub. The following Docker image tags and
associated inventories represent the latest Ray version from the official Docker Hub and are validated for
`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
icon to view the image on Docker Hub.

.. list-table::
   :header-rows: 1
   :class: docker-image-compatibility

   * - Docker image
     - Ray
     - Pytorch
     - Ubuntu
     - Python
   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>

     - `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
     - 2.6.0+git684f6f2
     - 24.04
     - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
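For orientation alongside the new page, a minimal Ray task program of the kind the prebuilt rocm/ray image is meant to run looks like the following sketch (illustrative; not taken from the page). Because ROCm support is upstreamed, the same script should run unmodified inside the container.

    import ray

    ray.init()  # start a local Ray runtime; on a cluster this attaches to it instead

    @ray.remote
    def square(x: int) -> int:
        # A Ray "task": scheduled by the core distributed runtime described above.
        return x * x

    futures = [square.remote(i) for i in range(8)]
    print(ray.get(futures))  # [0, 1, 4, 9, 16, 25, 36, 49]
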
@@ -1,76 +0,0 @@ (entire file removed; former contents below)

:orphan:

.. meta::
   :description: Taichi compatibility
   :keywords: GPU, Taichi compatibility

.. version-set:: rocm_version latest

*******************************************************************************
Taichi compatibility
*******************************************************************************

`Taichi <https://www.taichi-lang.org/>`_ is an open-source, imperative, and parallel
programming language designed for high-performance numerical computation.
Embedded in Python, it leverages just-in-time (JIT) compilation frameworks such as LLVM to accelerate
compute-intensive Python code by compiling it to native GPU or CPU instructions.

Taichi is widely used across various domains, including real-time physical simulation,
numerical computing, augmented reality, artificial intelligence, computer vision, robotics,
visual effects in film and gaming, and general-purpose computing.

* ROCm support for Taichi is hosted in the official `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`_ repository.
* Due to independent compatibility considerations, this location differs from the `https://github.com/taichi-dev <https://github.com/taichi-dev>`_ upstream repository.
* Use the prebuilt :ref:`Docker image <taichi-docker-compat>` with ROCm, PyTorch, and Taichi preinstalled.
* See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>` to install and get started.

.. note::

   Taichi is supported on ROCm 6.3.2.

Supported devices and features
===============================================================================
There is support through the ROCm software stack for all Taichi GPU features on AMD Instinct MI250X and MI210X series GPUs with the exception of Taichi’s GPU rendering system, CGUI.
AMD Instinct MI300X series GPUs will be supported by November.

.. _taichi-recommendations:

Use cases and recommendations
================================================================================
To fully leverage Taichi's performance capabilities in compute-intensive tasks, it is best to adhere to specific coding patterns and utilize Taichi decorators.
A collection of example use cases is available in the `https://github.com/ROCm/taichi_examples <https://github.com/ROCm/taichi_examples>`_ repository,
providing practical insights and foundational knowledge for working with the Taichi programming language.
You can also refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_ to search for Taichi examples and best practices to optimize your workflows on AMD GPUs.

.. _taichi-docker-compat:

Docker image compatibility
================================================================================

.. |docker-icon| raw:: html

   <i class="fab fa-docker"></i>

AMD validates and publishes ready-made `ROCm Taichi Docker images <https://hub.docker.com/r/rocm/taichi/tags>`_
with ROCm backends on Docker Hub. The following Docker image tags and associated inventories
represent the latest Taichi version from the official Docker Hub.
The Docker images have been validated for `ROCm 6.3.2 <https://rocm.docs.amd.com/en/docs-6.3.2/about/release-notes.html>`_.
Click |docker-icon| to view the image on Docker Hub.

.. list-table::
   :header-rows: 1
   :class: docker-image-compatibility

   * - Docker image
     - ROCm
     - Taichi
     - Ubuntu
     - Python
   * - .. raw:: html

          <a href="https://hub.docker.com/layers/rocm/taichi/taichi-1.8.0b1_rocm6.3.2_ubuntu22.04_py3.10.12/images/sha256-e016964a751e6a92199032d23e70fa3a564fff8555afe85cd718f8aa63f11fc6"><i class="fab fa-docker fa-lg"></i> rocm/taichi</a>

     - `6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`_
     - `1.8.0b1 <https://github.com/taichi-dev/taichi>`_
     - 22.04
     - `3.10.12 <https://www.python.org/downloads/release/python-31012/>`_
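For context on the removed page's advice to "adhere to specific coding patterns and utilize Taichi decorators", a minimal Taichi kernel looks like the following sketch (illustrative; not taken from the page). ``ti.init(arch=ti.gpu)`` is what selects the GPU backend that the ROCm build provides.

    import taichi as ti

    ti.init(arch=ti.gpu)  # falls back to CPU if no supported GPU backend is found

    n = 1024
    x = ti.field(dtype=ti.f32, shape=n)

    @ti.kernel
    def scale(factor: ti.f32):
        for i in x:       # struct-for loop; Taichi JIT-compiles and parallelizes it
            x[i] = i * factor

    scale(0.5)
    print(x[10])          # 5.0
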
@@ -100,7 +100,8 @@ article_pages = [
     {"file": "compatibility/ml-compatibility/stanford-megatron-lm-compatibility", "os": ["linux"]},
     {"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
     {"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
-    {"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
+    {"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
+    {"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
     {"file": "how-to/deep-learning-rocm", "os": ["linux"]},

     {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
@@ -117,11 +118,15 @@ article_pages = [
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
@@ -147,6 +152,8 @@ article_pages = [
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
@@ -0,0 +1,91 @@ (new file)

vllm_benchmark:
  unified_docker:
    latest:
      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
      rocm_version: 6.4.1
      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
      pytorch_version: 2.7.0+gitf717b2a
      hipblaslt_version: 0.15
  model_groups:
    - group: Meta Llama
      tag: llama
      models:
        - model: Llama 3.1 8B
          mad_tag: pyt_vllm_llama-3.1-8b
          model_repo: meta-llama/Llama-3.1-8B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-8B
          precision: float16
        - model: Llama 3.1 70B
          mad_tag: pyt_vllm_llama-3.1-70b
          model_repo: meta-llama/Llama-3.1-70B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
          precision: float16
        - model: Llama 3.1 405B
          mad_tag: pyt_vllm_llama-3.1-405b
          model_repo: meta-llama/Llama-3.1-405B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
          precision: float16
        - model: Llama 2 70B
          mad_tag: pyt_vllm_llama-2-70b
          model_repo: meta-llama/Llama-2-70b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
          precision: float16
        - model: Llama 3.1 8B FP8
          mad_tag: pyt_vllm_llama-3.1-8b_fp8
          model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 70B FP8
          mad_tag: pyt_vllm_llama-3.1-70b_fp8
          model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 405B FP8
          mad_tag: pyt_vllm_llama-3.1-405b_fp8
          model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
          precision: float8
    - group: Mistral AI
      tag: mistral
      models:
        - model: Mixtral MoE 8x7B
          mad_tag: pyt_vllm_mixtral-8x7b
          model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
          precision: float16
        - model: Mixtral MoE 8x22B
          mad_tag: pyt_vllm_mixtral-8x22b
          model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
          precision: float16
        - model: Mixtral MoE 8x7B FP8
          mad_tag: pyt_vllm_mixtral-8x7b_fp8
          model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mixtral MoE 8x22B FP8
          mad_tag: pyt_vllm_mixtral-8x22b_fp8
          model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          precision: float8
    - group: Qwen
      tag: qwen
      models:
        - model: QwQ-32B
          mad_tag: pyt_vllm_qwq-32b
          model_repo: Qwen/QwQ-32B
          url: https://huggingface.co/Qwen/QwQ-32B
          precision: float16
        - model: Qwen3 30B A3B
          mad_tag: pyt_vllm_qwen3-30b-a3b
          model_repo: Qwen/Qwen3-30B-A3B
          url: https://huggingface.co/Qwen/Qwen3-30B-A3B
          precision: float16
    - group: Microsoft Phi
      tag: phi
      models:
        - model: Phi-4
          mad_tag: pyt_vllm_phi-4
          model_repo: microsoft/phi-4
          url: https://huggingface.co/microsoft/phi-4
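The manifest above (and the variant that follows) is plain YAML, so a docs build or benchmark harness can walk it directly. A minimal sketch of doing so, assuming the file is saved locally as vllm-benchmark.yaml (hypothetical name) and that model_groups sits alongside unified_docker as reconstructed here:

    import yaml  # PyYAML

    # Illustrative consumer of the benchmark manifest; file name and nesting are assumptions.
    with open("vllm-benchmark.yaml") as f:
        bench = yaml.safe_load(f)["vllm_benchmark"]

    docker = bench["unified_docker"]["latest"]
    print(f"image: {docker['pull_tag']} (ROCm {docker['rocm_version']}, vLLM {docker['vllm_version']})")

    for group in bench["model_groups"]:
        for model in group["models"]:
            # Some entries omit keys (the Phi-4 entry above has no `precision`), so use .get().
            precision = model.get("precision", "unspecified")
            print(f"{group['group']}: {model['model']} [{precision}] -> {model['model_repo']}")
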
@@ -0,0 +1,163 @@ (new file)

vllm_benchmark:
  unified_docker:
    latest:
      # TODO: update me
      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
      rocm_version: 6.4.1
      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
      pytorch_version: 2.7.0+gitf717b2a
      hipblaslt_version: 0.15
  model_groups:
    - group: Meta Llama
      tag: llama
      models:
        - model: Llama 3.1 8B
          mad_tag: pyt_vllm_llama-3.1-8b
          model_repo: meta-llama/Llama-3.1-8B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-8B
          precision: float16
        - model: Llama 3.1 70B
          mad_tag: pyt_vllm_llama-3.1-70b
          model_repo: meta-llama/Llama-3.1-70B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
          precision: float16
        - model: Llama 3.1 405B
          mad_tag: pyt_vllm_llama-3.1-405b
          model_repo: meta-llama/Llama-3.1-405B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
          precision: float16
        - model: Llama 2 7B
          mad_tag: pyt_vllm_llama-2-7b
          model_repo: meta-llama/Llama-2-7b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
          precision: float16
        - model: Llama 2 70B
          mad_tag: pyt_vllm_llama-2-70b
          model_repo: meta-llama/Llama-2-70b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
          precision: float16
        - model: Llama 3.1 8B FP8
          mad_tag: pyt_vllm_llama-3.1-8b_fp8
          model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 70B FP8
          mad_tag: pyt_vllm_llama-3.1-70b_fp8
          model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 405B FP8
          mad_tag: pyt_vllm_llama-3.1-405b_fp8
          model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
          precision: float8
    - group: Mistral AI
      tag: mistral
      models:
        - model: Mixtral MoE 8x7B
          mad_tag: pyt_vllm_mixtral-8x7b
          model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
          precision: float16
        - model: Mixtral MoE 8x22B
          mad_tag: pyt_vllm_mixtral-8x22b
          model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
          precision: float16
        - model: Mistral 7B
          mad_tag: pyt_vllm_mistral-7b
          model_repo: mistralai/Mistral-7B-Instruct-v0.3
          url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
          precision: float16
        - model: Mixtral MoE 8x7B FP8
          mad_tag: pyt_vllm_mixtral-8x7b_fp8
          model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mixtral MoE 8x22B FP8
          mad_tag: pyt_vllm_mixtral-8x22b_fp8
          model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mistral 7B FP8
          mad_tag: pyt_vllm_mistral-7b_fp8
          model_repo: amd/Mistral-7B-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
          precision: float8
    - group: Qwen
      tag: qwen
      models:
        - model: Qwen2 7B
          mad_tag: pyt_vllm_qwen2-7b
          model_repo: Qwen/Qwen2-7B-Instruct
          url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
          precision: float16
        - model: Qwen2 72B
          mad_tag: pyt_vllm_qwen2-72b
          model_repo: Qwen/Qwen2-72B-Instruct
          url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
          precision: float16
        - model: QwQ-32B
          mad_tag: pyt_vllm_qwq-32b
          model_repo: Qwen/QwQ-32B
          url: https://huggingface.co/Qwen/QwQ-32B
          precision: float16
          tunableop: true
    - group: Databricks DBRX
      tag: dbrx
      models:
        - model: DBRX Instruct
          mad_tag: pyt_vllm_dbrx-instruct
          model_repo: databricks/dbrx-instruct
          url: https://huggingface.co/databricks/dbrx-instruct
          precision: float16
        - model: DBRX Instruct FP8
          mad_tag: pyt_vllm_dbrx_fp8
          model_repo: amd/dbrx-instruct-FP8-KV
          url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
          precision: float8
    - group: Google Gemma
      tag: gemma
      models:
        - model: Gemma 2 27B
          mad_tag: pyt_vllm_gemma-2-27b
          model_repo: google/gemma-2-27b
          url: https://huggingface.co/google/gemma-2-27b
          precision: float16
    - group: Cohere
      tag: cohere
      models:
        - model: C4AI Command R+ 08-2024
          mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
          model_repo: CohereForAI/c4ai-command-r-plus-08-2024
          url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
          precision: float16
        - model: C4AI Command R+ 08-2024 FP8
          mad_tag: pyt_vllm_command-r-plus_fp8
          model_repo: amd/c4ai-command-r-plus-FP8-KV
          url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
          precision: float8
    - group: DeepSeek
      tag: deepseek
      models:
        - model: DeepSeek MoE 16B
          mad_tag: pyt_vllm_deepseek-moe-16b-chat
          model_repo: deepseek-ai/deepseek-moe-16b-chat
          url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
          precision: float16
    - group: Microsoft Phi
      tag: phi
      models:
        - model: Phi-4
          mad_tag: pyt_vllm_phi-4
          model_repo: microsoft/phi-4
          url: https://huggingface.co/microsoft/phi-4
    - group: TII Falcon
      tag: falcon
      models:
        - model: Falcon 180B
          mad_tag: pyt_vllm_falcon-180b
          model_repo: tiiuae/falcon-180B
          url: https://huggingface.co/tiiuae/falcon-180B
          precision: float16
@@ -39,7 +39,7 @@ pytorch_inference_benchmark:
       model_repo: Wan-AI/Wan2.1-T2V-14B
       url: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
       precision: bfloat16
-  - group: Janus-Pro
+  - group: Janus Pro
     tag: janus-pro
     models:
    - model: Janus Pro 7B
@@ -47,3 +47,11 @@ pytorch_inference_benchmark:
       model_repo: deepseek-ai/Janus-Pro-7B
       url: https://huggingface.co/deepseek-ai/Janus-Pro-7B
       precision: bfloat16
+  - group: Hunyuan Video
+    tag: hunyuan
+    models:
+    - model: Hunyuan Video
+      mad_tag: pyt_hy_video
+      model_repo: tencent/HunyuanVideo
+      url: https://huggingface.co/tencent/HunyuanVideo
+      precision: float16
@@ -1,17 +1,16 @@
-sglang_benchmark:
-  unified_docker:
-    latest:
-      pull_tag: lmsysorg/sglang:v0.4.5-rocm630
-      docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
-      rocm_version: 6.3.0
-      sglang_version: 0.4.5 (0.4.5-rocm)
-      pytorch_version: 2.6.0a0+git8d4926e
-  model_groups:
-  - group: DeepSeek
-    tag: deepseek
-    models:
-    - model: DeepSeek-R1-Distill-Qwen-32B
-      mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
-      model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
-      url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
-      precision: bfloat16
+dockers:
+- pull_tag: lmsysorg/sglang:v0.4.5-rocm630
+  docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
+  components:
+    ROCm: 6.3.0
+    SGLang: 0.4.5 (0.4.5-rocm)
+    PyTorch: 2.6.0a0+git8d4926e
+model_groups:
+- group: DeepSeek
+  tag: deepseek
+  models:
+  - model: DeepSeek-R1-Distill-Qwen-32B
+    mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
+    model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
+    url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
+    precision: bfloat16
@@ -1,163 +1,188 @@
-vllm_benchmark:
-  unified_docker:
-    latest:
-      # TODO: update me
-      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
-      rocm_version: 6.4.1
-      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
-      pytorch_version: 2.7.0+gitf717b2a
-      hipblaslt_version: 0.15
-  model_groups:
+dockers:
+- pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
+  docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
+  components:
+    ROCm: 6.4.1
+    vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
+    PyTorch: 2.7.0+gitf717b2a
+    hipBLASLt: 0.15
+model_groups:
+- group: Meta Llama
+  tag: llama
+  models:
+  - model: Llama 3.1 8B
+    mad_tag: pyt_vllm_llama-3.1-8b
+    model_repo: meta-llama/Llama-3.1-8B-Instruct
+    url: https://huggingface.co/meta-llama/Llama-3.1-8B
+    precision: float16
+    config:
+      tp: 1
+      dtype: auto
+      kv_cache_dtype: auto
+      max_seq_len_to_capture: 131072
+      max_num_batched_tokens: 131072
+      max_model_len: 8192
+  - model: Llama 3.1 70B
+    mad_tag: pyt_vllm_llama-3.1-70b
+    model_repo: meta-llama/Llama-3.1-70B-Instruct
+    url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+    precision: float16
+    config:
+      tp: 8
+      dtype: auto
+      kv_cache_dtype: auto
+      max_seq_len_to_capture: 131072
+      max_num_batched_tokens: 131072
+      max_model_len: 8192
+  - model: Llama 3.1 405B
+    mad_tag: pyt_vllm_llama-3.1-405b
+    model_repo: meta-llama/Llama-3.1-405B-Instruct
+    url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+    precision: float16
+    config:
+      tp: 8
+      dtype: auto
+      kv_cache_dtype: auto
+      max_seq_len_to_capture: 131072
+      max_num_batched_tokens: 131072
+      max_model_len: 8192
+  - model: Llama 2 70B
+    mad_tag: pyt_vllm_llama-2-70b
+    model_repo: meta-llama/Llama-2-70b-chat-hf
+    url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+    precision: float16
+    config:
+      tp: 8
+      dtype: auto
+      kv_cache_dtype: auto
+      max_seq_len_to_capture: 4096
+      max_num_batched_tokens: 4096
+      max_model_len: 4096
+  - model: Llama 3.1 8B FP8
+    mad_tag: pyt_vllm_llama-3.1-8b_fp8
+    model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+    url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+    precision: float8
+    config:
+      tp: 1
+      dtype: auto
+      kv_cache_dtype: fp8
+      max_seq_len_to_capture: 131072
+      max_num_batched_tokens: 131072
+      max_model_len: 8192
+  - model: Llama 3.1 70B FP8
+    mad_tag: pyt_vllm_llama-3.1-70b_fp8
+    model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+    url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+    precision: float8
+    config:
+      tp: 8
+      dtype: auto
+      kv_cache_dtype: fp8
+      max_seq_len_to_capture: 131072
+      max_num_batched_tokens: 131072
+      max_model_len: 8192
+  - model: Llama 3.1 405B FP8
+    mad_tag: pyt_vllm_llama-3.1-405b_fp8
+    model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+    url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+    precision: float8
+    config:
+      tp: 8
+      dtype: auto
+      kv_cache_dtype: fp8
+      max_seq_len_to_capture: 131072
+      max_num_batched_tokens: 131072
+      max_model_len: 8192
+- group: Mistral AI
+  tag: mistral
+  models:
+  - model: Mixtral MoE 8x7B
+    mad_tag: pyt_vllm_mixtral-8x7b
+    model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+    url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+    precision: float16
+    config:
+      tp: 8
+      dtype: auto
+      kv_cache_dtype: auto
+      max_seq_len_to_capture: 32768
+      max_num_batched_tokens: 32768
+      max_model_len: 8192
+  - model: Mixtral MoE 8x22B
+    mad_tag: pyt_vllm_mixtral-8x22b
+    model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+    url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+    precision: float16
+    config:
+      tp: 8
+      dtype: auto
+      kv_cache_dtype: auto
+      max_seq_len_to_capture: 65536
+      max_num_batched_tokens: 65536
+      max_model_len: 8192
+  - model: Mixtral MoE 8x7B FP8
+    mad_tag: pyt_vllm_mixtral-8x7b_fp8
+    model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+    url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+    precision: float8
+    config:
+      tp: 8
+      dtype: auto
+      kv_cache_dtype: fp8
+      max_seq_len_to_capture: 32768
+      max_num_batched_tokens: 32768
+      max_model_len: 8192
+  - model: Mixtral MoE 8x22B FP8
+    mad_tag: pyt_vllm_mixtral-8x22b_fp8
+    model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+    url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+    precision: float8
+    config:
+      tp: 8
+      dtype: auto
+      kv_cache_dtype: fp8
+      max_seq_len_to_capture: 65536
+      max_num_batched_tokens: 65536
+      max_model_len: 8192
+- group: Qwen
+  tag: qwen
+  models:
+  - model: QwQ-32B
+    mad_tag: pyt_vllm_qwq-32b
+    model_repo: Qwen/QwQ-32B
+    url: https://huggingface.co/Qwen/QwQ-32B
+    precision: float16
+    config:
+      tp: 1
+      dtype: auto
+      kv_cache_dtype: auto
+      max_seq_len_to_capture: 131072
+      max_num_batched_tokens: 131072
+      max_model_len: 8192
+  - model: Qwen3 30B A3B
+    mad_tag: pyt_vllm_qwen3-30b-a3b
+    model_repo: Qwen/Qwen3-30B-A3B
+    url: https://huggingface.co/Qwen/Qwen3-30B-A3B
+    precision: float16
+    config:
+      tp: 1
+      dtype: auto
+      kv_cache_dtype: auto
+      max_seq_len_to_capture: 32768
+      max_num_batched_tokens: 32768
+      max_model_len: 8192
+- group: Microsoft Phi
+  tag: phi
+  models:
+  - model: Phi-4
+    mad_tag: pyt_vllm_phi-4
+    model_repo: microsoft/phi-4
+    url: https://huggingface.co/microsoft/phi-4
+    config:
+      tp: 1
+      dtype: auto
+      kv_cache_dtype: auto
+      max_seq_len_to_capture: 16384
+      max_num_batched_tokens: 16384
+      max_model_len: 8192
@@ -0,0 +1,72 @@
dockers:
- pull_tag: rocm/jax-training:maxtext-v25.7
  docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
  components:
    ROCm: 6.4.1
    JAX: 0.5.0
    Python: 3.10.12
    Transformer Engine: 2.1.0+90d703dd
    hipBLASLt: 1.x.x
- pull_tag: rocm/jax-training:maxtext-v25.7-jax060
  docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
  components:
    ROCm: 6.4.1
    JAX: 0.6.0
    Python: 3.10.12
    Transformer Engine: 2.1.0+90d703dd
    hipBLASLt: 1.1.0-499ece1c21
model_groups:
- group: Meta Llama
  tag: llama
  models:
  - model: Llama 3.3 70B
    mad_tag: jax_maxtext_train_llama-3.3-70b
    model_repo: Llama-3.3-70B
    precision: bf16
    doc_options: ["single-node"]
  - model: Llama 3.1 8B
    mad_tag: jax_maxtext_train_llama-3.1-8b
    model_repo: Llama-3.1-8B
    precision: bf16
    doc_options: ["single-node"]
  - model: Llama 3.1 70B
    mad_tag: jax_maxtext_train_llama-3.1-70b
    model_repo: Llama-3.1-70B
    precision: bf16
    doc_options: ["single-node"]
  - model: Llama 3 8B
    mad_tag: jax_maxtext_train_llama-3-8b
    multinode_training_script: llama3_8b_multinode.sh
    doc_options: ["multi-node"]
  - model: Llama 3 70B
    mad_tag: jax_maxtext_train_llama-3-70b
    multinode_training_script: llama3_70b_multinode.sh
    doc_options: ["multi-node"]
  - model: Llama 2 7B
    mad_tag: jax_maxtext_train_llama-2-7b
    model_repo: Llama-2-7B
    precision: bf16
    multinode_training_script: llama2_7b_multinode.sh
    doc_options: ["single-node", "multi-node"]
  - model: Llama 2 70B
    mad_tag: jax_maxtext_train_llama-2-70b
    model_repo: Llama-2-70B
    precision: bf16
    multinode_training_script: llama2_70b_multinode.sh
    doc_options: ["single-node", "multi-node"]
- group: DeepSeek
  tag: deepseek
  models:
  - model: DeepSeek-V2-Lite (16B)
    mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
    model_repo: DeepSeek-V2-lite
    precision: bf16
    doc_options: ["single-node"]
- group: Mistral AI
  tag: mistral
  models:
  - model: Mixtral 8x7B
    mad_tag: jax_maxtext_train_mixtral-8x7b
    model_repo: Mixtral-8x7B
    precision: bf16
    doc_options: ["single-node"]
@@ -1,26 +1,15 @@
 dockers:
-- pull_tag: rocm/megatron-lm:v25.6_py312
-  docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
+- pull_tag: rocm/megatron-lm:v25.7_py310
+  docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
   components:
-    ROCm: 6.4.1
-    PyTorch: 2.8.0a0+git7d205b2
-    Python: 3.12
-    Transformer Engine: 2.1.0.dev0+8c4a512
-    hipBLASLt: 393e413
-    Triton: 3.3.0
-    RCCL: 2.23.4.7a84c5d
-  doc_name: Ubuntu 24.04 + Python 3.12
-- pull_tag: rocm/megatron-lm:v25.6_py310
-  docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
-  components:
-    ROCm: 6.4.1
-    PyTorch: 2.8.0a0+git7d205b2
+    ROCm: 6.4.2
+    Primus: v0.1.0-rc1
+    PyTorch: 2.8.0a0+gitd06a406
     Python: "3.10"
-    Transformer Engine: 2.1.0.dev0+8c4a512
-    hipBLASLt: 393e413
+    Transformer Engine: 2.1.0.dev0+ba586519
+    hipBLASLt: 37ba1d36
     Triton: 3.3.0
-    RCCL: 2.23.4.7a84c5d
-  doc_name: Ubuntu 22.04 + Python 3.10
+    RCCL: 2.22.3
 model_groups:
 - group: Meta Llama
   tag: llama
@@ -0,0 +1,60 @@
dockers:
- pull_tag: rocm/megatron-lm:v25.6_py312
  docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
  components:
    ROCm: 6.4.1
    PyTorch: 2.8.0a0+git7d205b2
    Python: 3.12
    Transformer Engine: 2.1.0.dev0+8c4a512
    hipBLASLt: 393e413
    Triton: 3.3.0
    RCCL: 2.23.4.7a84c5d
  doc_name: Ubuntu 24.04 + Python 3.12
- pull_tag: rocm/megatron-lm:v25.6_py310
  docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
  components:
    ROCm: 6.4.1
    PyTorch: 2.8.0a0+git7d205b2
    Python: "3.10"
    Transformer Engine: 2.1.0.dev0+8c4a512
    hipBLASLt: 393e413
    Triton: 3.3.0
    RCCL: 2.23.4.7a84c5d
  doc_name: Ubuntu 22.04 + Python 3.10
model_groups:
- group: Meta Llama
  tag: llama
  models:
  - model: Llama 3.3 70B
    mad_tag: pyt_megatron_lm_train_llama-3.3-70b
  - model: Llama 3.1 8B
    mad_tag: pyt_megatron_lm_train_llama-3.1-8b
  - model: Llama 3.1 70B
    mad_tag: pyt_megatron_lm_train_llama-3.1-70b
  - model: Llama 3.1 70B (proxy)
    mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
  - model: Llama 2 7B
    mad_tag: pyt_megatron_lm_train_llama-2-7b
  - model: Llama 2 70B
    mad_tag: pyt_megatron_lm_train_llama-2-70b
- group: DeepSeek
  tag: deepseek
  models:
  - model: DeepSeek-V3 (proxy)
    mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
  - model: DeepSeek-V2-Lite
    mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
- group: Mistral AI
  tag: mistral
  models:
  - model: Mixtral 8x7B
    mad_tag: pyt_megatron_lm_train_mixtral-8x7b
  - model: Mixtral 8x22B (proxy)
    mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
- group: Qwen
  tag: qwen
  models:
  - model: Qwen 2.5 7B
    mad_tag: pyt_megatron_lm_train_qwen2.5-7b
  - model: Qwen 2.5 72B
    mad_tag: pyt_megatron_lm_train_qwen2.5-72b
@@ -0,0 +1,120 @@
unified_docker:
  latest:
    pull_tag: rocm/pytorch-training:v25.6
    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
    rocm_version: 6.4.1
    pytorch_version: 2.8.0a0+git7d205b2
    python_version: 3.10.17
    transformer_engine_version: 1.14.0+2f85f5f2
    flash_attention_version: 3.0.0.post1
    hipblaslt_version: 0.15.0-8c6919d
    triton_version: 3.3.0
model_groups:
- group: Pre-training
  tag: pre-training
  models:
  - model: Llama 3.1 8B
    mad_tag: pyt_train_llama-3.1-8b
    model_repo: Llama-3.1-8B
    url: https://huggingface.co/meta-llama/Llama-3.1-8B
    precision: BF16
    training_modes: [pretrain]
  - model: Llama 3.1 70B
    mad_tag: pyt_train_llama-3.1-70b
    model_repo: Llama-3.1-70B
    url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
    precision: BF16
    training_modes: [pretrain]
  - model: FLUX.1-dev
    mad_tag: pyt_train_flux
    model_repo: Flux
    url: https://huggingface.co/black-forest-labs/FLUX.1-dev
    precision: BF16
    training_modes: [pretrain]
- group: Fine-tuning
  tag: fine-tuning
  models:
  - model: Llama 4 Scout 17B-16E
    mad_tag: pyt_train_llama-4-scout-17b-16e
    model_repo: Llama-4-17B_16E
    url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
    precision: BF16
    training_modes: [finetune_fw, finetune_lora]
  - model: Llama 3.3 70B
    mad_tag: pyt_train_llama-3.3-70b
    model_repo: Llama-3.3-70B
    url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
    precision: BF16
    training_modes: [finetune_fw, finetune_lora, finetune_qlora]
  - model: Llama 3.2 1B
    mad_tag: pyt_train_llama-3.2-1b
    model_repo: Llama-3.2-1B
    url: https://huggingface.co/meta-llama/Llama-3.2-1B
    precision: BF16
    training_modes: [finetune_fw, finetune_lora]
  - model: Llama 3.2 3B
    mad_tag: pyt_train_llama-3.2-3b
    model_repo: Llama-3.2-3B
    url: https://huggingface.co/meta-llama/Llama-3.2-3B
    precision: BF16
    training_modes: [finetune_fw, finetune_lora]
  - model: Llama 3.2 Vision 11B
    mad_tag: pyt_train_llama-3.2-vision-11b
    model_repo: Llama-3.2-Vision-11B
    url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
    precision: BF16
    training_modes: [finetune_fw]
  - model: Llama 3.2 Vision 90B
    mad_tag: pyt_train_llama-3.2-vision-90b
    model_repo: Llama-3.2-Vision-90B
    url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
    precision: BF16
    training_modes: [finetune_fw]
  - model: Llama 3.1 8B
    mad_tag: pyt_train_llama-3.1-8b
    model_repo: Llama-3.1-8B
    url: https://huggingface.co/meta-llama/Llama-3.1-8B
    precision: BF16
    training_modes: [finetune_fw, finetune_lora]
  - model: Llama 3.1 70B
    mad_tag: pyt_train_llama-3.1-70b
    model_repo: Llama-3.1-70B
    url: https://huggingface.co/meta-llama/Llama-3.1-70B
    precision: BF16
    training_modes: [finetune_fw, finetune_lora, finetune_qlora]
  - model: Llama 3.1 405B
    mad_tag: pyt_train_llama-3.1-405b
    model_repo: Llama-3.1-405B
    url: https://huggingface.co/meta-llama/Llama-3.1-405B
    precision: BF16
    training_modes: [finetune_qlora, HF_finetune_lora]
  - model: Llama 3 8B
    mad_tag: pyt_train_llama-3-8b
    model_repo: Llama-3-8B
    url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
    precision: BF16
    training_modes: [finetune_fw, finetune_lora]
  - model: Llama 3 70B
    mad_tag: pyt_train_llama-3-70b
    model_repo: Llama-3-70B
    url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
    precision: BF16
    training_modes: [finetune_fw, finetune_lora]
  - model: Llama 2 7B
    mad_tag: pyt_train_llama-2-7b
    model_repo: Llama-2-7B
    url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
    precision: BF16
    training_modes: [finetune_fw, finetune_lora, finetune_qlora]
  - model: Llama 2 13B
    mad_tag: pyt_train_llama-2-13b
    model_repo: Llama-2-13B
    url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
    precision: BF16
    training_modes: [finetune_fw, finetune_lora]
  - model: Llama 2 70B
    mad_tag: pyt_train_llama-2-70b
    model_repo: Llama-2-70B
    url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
    precision: BF16
    training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
@@ -0,0 +1,58 @@
dockers:
- pull_tag: rocm/megatron-lm:v25.7_py310
  docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
  components:
    ROCm: 6.4.2
    Primus: v0.1.0-rc1
    PyTorch: 2.8.0a0+gitd06a406
    Python: "3.10"
    Transformer Engine: 2.1.0.dev0+ba586519
    hipBLASLt: 37ba1d36
    Triton: 3.3.0
    RCCL: 2.22.3
model_groups:
- group: Meta Llama
  tag: llama
  models:
  - model: Llama 3.3 70B
    mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
    config_name: llama3.3_70B-pretrain.yaml
  - model: Llama 3.1 70B
    mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
    config_name: llama3.1_70B-pretrain.yaml
  - model: Llama 3.1 8B
    mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
    config_name: llama3.1_8B-pretrain.yaml
  - model: Llama 2 7B
    mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
    config_name: llama2_7B-pretrain.yaml
  - model: Llama 2 70B
    mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
    config_name: llama2_70B-pretrain.yaml
- group: DeepSeek
  tag: deepseek
  models:
  - model: DeepSeek-V3 (proxy)
    mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
    config_name: deepseek_v3-pretrain.yaml
  - model: DeepSeek-V2-Lite
    mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
    config_name: deepseek_v2_lite-pretrain.yaml
- group: Mistral AI
  tag: mistral
  models:
  - model: Mixtral 8x7B
    mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
    config_name: mixtral_8x7B_v0.1-pretrain.yaml
  - model: Mixtral 8x22B (proxy)
    mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
    config_name: mixtral_8x22B_v0.1-pretrain.yaml
- group: Qwen
  tag: qwen
  models:
  - model: Qwen 2.5 7B
    mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
    config_name: primus_qwen2.5_7B-pretrain.yaml
  - model: Qwen 2.5 72B
    mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
    config_name: qwen2.5_72B-pretrain.yaml
@@ -1,38 +1,17 @@
-unified_docker:
-  latest:
-    pull_tag: rocm/pytorch-training:v25.6
-    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
-    rocm_version: 6.4.1
-    pytorch_version: 2.8.0a0+git7d205b2
-    python_version: 3.10.17
-    transformer_engine_version: 1.14.0+2f85f5f2
-    flash_attention_version: 3.0.0.post1
-    hipblaslt_version: 0.15.0-8c6919d
-    triton_version: 3.3.0
+dockers:
+- pull_tag: rocm/pytorch-training:v25.7
+  docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
+  components:
+    ROCm: 6.4.2
+    PyTorch: 2.8.0a0+gitd06a406
+    Python: 3.10.18
+    Transformer Engine: 2.2.0.dev0+94e53dd8
+    Flash Attention: 3.0.0.post1
+    hipBLASLt: 1.1.0-4b9a52edfc
+    Triton: 3.3.0
 model_groups:
-- group: Pre-training
-  tag: pre-training
-  models:
-  - model: Llama 3.1 8B
-    mad_tag: pyt_train_llama-3.1-8b
-    model_repo: Llama-3.1-8B
-    url: https://huggingface.co/meta-llama/Llama-3.1-8B
-    precision: BF16
-    training_modes: [pretrain]
-  - model: Llama 3.1 70B
-    mad_tag: pyt_train_llama-3.1-70b
-    model_repo: Llama-3.1-70B
-    url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-    precision: BF16
-    training_modes: [pretrain]
-  - model: FLUX.1-dev
-    mad_tag: pyt_train_flux
-    model_repo: Flux
-    url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-    precision: BF16
-    training_modes: [pretrain]
-- group: Fine-tuning
-  tag: fine-tuning
+- group: Meta Llama
+  tag: llama
   models:
   - model: Llama 4 Scout 17B-16E
     mad_tag: pyt_train_llama-4-scout-17b-16e
@@ -75,19 +54,19 @@ model_groups:
     model_repo: Llama-3.1-8B
     url: https://huggingface.co/meta-llama/Llama-3.1-8B
     precision: BF16
-    training_modes: [finetune_fw, finetune_lora]
+    training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
   - model: Llama 3.1 70B
     mad_tag: pyt_train_llama-3.1-70b
     model_repo: Llama-3.1-70B
-    url: https://huggingface.co/meta-llama/Llama-3.1-70B
+    url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
     precision: BF16
-    training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    training_modes: [pretrain, finetune_fw, finetune_lora]
   - model: Llama 3.1 405B
     mad_tag: pyt_train_llama-3.1-405b
     model_repo: Llama-3.1-405B
     url: https://huggingface.co/meta-llama/Llama-3.1-405B
     precision: BF16
-    training_modes: [finetune_qlora, HF_finetune_lora]
+    training_modes: [finetune_qlora]
   - model: Llama 3 8B
     mad_tag: pyt_train_llama-3-8b
     model_repo: Llama-3-8B
@@ -117,4 +96,67 @@ model_groups:
     model_repo: Llama-2-70B
     url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
     precision: BF16
-    training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
+    training_modes: [finetune_lora, finetune_qlora]
+- group: OpenAI
+  tag: openai
+  models:
+  - model: GPT OSS 20B
+    mad_tag: pyt_train_gpt_oss_20b
+    model_repo: GPT-OSS-20B
+    url: https://huggingface.co/openai/gpt-oss-20b
+    precision: BF16
+    training_modes: [HF_finetune_lora]
+  - model: GPT OSS 120B
+    mad_tag: pyt_train_gpt_oss_120b
+    model_repo: GPT-OSS-120B
+    url: https://huggingface.co/openai/gpt-oss-120b
+    precision: BF16
+    training_modes: [HF_finetune_lora]
+- group: Qwen
+  tag: qwen
+  models:
+  - model: Qwen 3 8B
+    mad_tag: pyt_train_qwen3-8b
+    model_repo: Qwen3-8B
+    url: https://huggingface.co/Qwen/Qwen3-8B
+    precision: BF16
+    training_modes: [finetune_fw, finetune_lora]
+  - model: Qwen 3 32B
+    mad_tag: pyt_train_qwen3-32b
+    model_repo: Qwen3-32
+    url: https://huggingface.co/Qwen/Qwen3-32B
+    precision: BF16
+    training_modes: [finetune_lora]
+  - model: Qwen 2.5 32B
+    mad_tag: pyt_train_qwen2.5-32b
+    model_repo: Qwen2.5-32B
+    url: https://huggingface.co/Qwen/Qwen2.5-32B
+    precision: BF16
+    training_modes: [finetune_lora]
+  - model: Qwen 2.5 72B
+    mad_tag: pyt_train_qwen2.5-72b
+    model_repo: Qwen2.5-72B
+    url: https://huggingface.co/Qwen/Qwen2.5-72B
+    precision: BF16
+    training_modes: [finetune_lora]
+  - model: Qwen 2 1.5B
+    mad_tag: pyt_train_qwen2-1.5b
+    model_repo: Qwen2-1.5B
+    url: https://huggingface.co/Qwen/Qwen2-1.5B
+    precision: BF16
+    training_modes: [finetune_fw, finetune_lora]
+  - model: Qwen 2 7B
+    mad_tag: pyt_train_qwen2-7b
+    model_repo: Qwen2-7B
+    url: https://huggingface.co/Qwen/Qwen2-7B
+    precision: BF16
+    training_modes: [finetune_fw, finetune_lora]
+- group: Flux
+  tag: flux
+  models:
+  - model: FLUX.1-dev
+    mad_tag: pyt_train_flux
+    model_repo: Flux
+    url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+    precision: BF16
+    training_modes: [pretrain]
@@ -19,5 +19,6 @@ The general steps to build ROCm are:
 #. Run the build command
 
 Because the ROCm stack is constantly evolving, the most current instructions are stored with the source code in GitHub.
-For detailed build instructions, see `Getting and Building ROCm from Source <https://github.com/ROCm/ROCm?tab=readme-ov-file#getting-and-building-rocm-from-source>`.
+For detailed build instructions, see `Getting and Building ROCm from Source <https://github.com/ROCm/ROCm?tab=readme-ov-file#getting-and-building-rocm-from-source>`_.
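A minimal sketch of the checkout those instructions describe, assuming the ``repo``-tool workflow outlined in the linked README; the directory and branch names are illustrative only:

.. code-block:: shell

   # Illustrative checkout only -- follow the GitHub README for the authoritative steps.
   mkdir -p ~/ROCm && cd ~/ROCm
   repo init -u https://github.com/ROCm/ROCm.git -b roc-6.4.x   # branch name is an example
   repo sync
   # Then run the documented build command for the components you need.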
@@ -2,58 +2,134 @@
    :description: How to install deep learning frameworks for ROCm
    :keywords: deep learning, frameworks, ROCm, install, PyTorch, TensorFlow, JAX, MAGMA, DeepSpeed, ML, AI
 
-********************************************
-Installing deep learning frameworks for ROCm
-********************************************
+**********************************
+Deep learning frameworks for ROCm
+**********************************
 
-ROCm provides a comprehensive ecosystem for deep learning development, including
-:ref:`libraries <artificial-intelligence-apis>` for optimized deep learning operations and ROCm-aware versions of popular
-deep learning frameworks and libraries such as PyTorch, TensorFlow, and JAX. ROCm works closely with these
-frameworks to ensure that framework-specific optimizations take advantage of AMD accelerator and GPU architectures.
-
-The following guides provide information on compatibility and supported
-features for these ROCm-enabled deep learning frameworks.
-
-* :doc:`PyTorch compatibility <../compatibility/ml-compatibility/pytorch-compatibility>`
-* :doc:`TensorFlow compatibility <../compatibility/ml-compatibility/tensorflow-compatibility>`
-* :doc:`JAX compatibility <../compatibility/ml-compatibility/jax-compatibility>`
-* :doc:`verl compatibility <../compatibility/ml-compatibility/verl-compatibility>`
-* :doc:`Stanford Megatron-LM compatibility <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`
-* :doc:`DGL compatibility <../compatibility/ml-compatibility/dgl-compatibility>`
-* :doc:`Megablocks compatibility <../compatibility/ml-compatibility/megablocks-compatibility>`
-* :doc:`Taichi compatibility <../compatibility/ml-compatibility/taichi-compatibility>`
-
-This chart steps through typical installation workflows for installing deep learning frameworks for ROCm.
-
-.. image:: ../data/how-to/framework_install_2024_07_04.png
-   :alt: Flowchart for installing ROCm-aware machine learning frameworks
-   :align: center
-
-See the installation instructions to get started.
-
-* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
-* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
-* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
-* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
-* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
-* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/dgl-install>`
-* :doc:`Megablocks for ROCm <rocm-install-on-linux:install/3rd-party/megablocks-install>`
-* :doc:`Taichi for ROCm <rocm-install-on-linux:install/3rd-party/taichi-install>`
-
-.. note::
-
-   For guidance on installing ROCm itself, refer to :doc:`ROCm installation for Linux <rocm-install-on-linux:index>`.
+Deep learning frameworks provide environments for machine learning, training, fine-tuning, inference, and performance optimization.
+
+ROCm offers a complete ecosystem for developing and running deep learning applications efficiently. It also provides ROCm-compatible versions of popular frameworks and libraries, such as PyTorch, TensorFlow, JAX, and others.
+
+The AMD ROCm organization actively contributes to open-source development and collaborates closely with framework organizations. This collaboration ensures that framework-specific optimizations effectively leverage AMD GPUs and accelerators.
+
+The table below summarizes information about ROCm-enabled deep learning frameworks. It includes details on ROCm compatibility and third-party tool support, installation steps and options, and links to GitHub resources. For a complete list of supported framework versions on ROCm, see the :doc:`Compatibility matrix <../compatibility/compatibility-matrix>` topic.
+
+.. list-table::
+   :header-rows: 1
+   :widths: 5 3 6 3
+
+   * - Framework
+     - Installation
+     - Installation options
+     - GitHub
+
+   * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`__
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`__
+       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`__
+       - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`__
+       - `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`__
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`__
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`__
+       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`__
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`__
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`__
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`__
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`__
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`__
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`__
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`__
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`__
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`__
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `Ray <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/ray-compatibility.html>`__
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#using-a-prebuilt-docker-image-with-ray-pre-installed>`__
+       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#install-ray-on-bare-metal-or-a-custom-container>`__
+       - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#build-your-own-docker-image>`__
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/ray"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `llama.cpp <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/llama-cpp-compatibility.html>`__
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#use-a-prebuilt-docker-image-with-llama-cpp-pre-installed>`__
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>
 
 Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
 through the following guides.
 
 * :doc:`rocm-for-ai/index`
 
-* :doc:`Training <rocm-for-ai/training/index>`
+* :doc:`Use ROCm for training <rocm-for-ai/training/index>`
 
-* :doc:`Fine-tuning LLMs <rocm-for-ai/fine-tuning/index>`
+* :doc:`Use ROCm for fine-tuning LLMs <rocm-for-ai/fine-tuning/index>`
 
-* :doc:`Inference <rocm-for-ai/inference/index>`
+* :doc:`Use ROCm for AI inference <rocm-for-ai/inference/index>`
 
-* :doc:`Inference optimization <rocm-for-ai/inference-optimization/index>`
+* :doc:`Use ROCm for AI inference optimization <rocm-for-ai/inference-optimization/index>`
@@ -939,7 +939,7 @@ hipBLASLt benchmarking
 The GEMM library
 `hipBLASLt <https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/index.html>`_
 provides a benchmark tool for its supported operations. Refer to the
-`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/benchmarks/README.md>`_
+`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/bench/README.md>`_
 for details.
 
 * Example 1: Benchmark mix fp8 GEMM
@@ -0,0 +1,445 @@
:orphan:

.. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

**********************************
vLLM inference performance testing
**********************************

.. caution::

   This documentation does not reflect the latest version of the ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.

.. _vllm-benchmark-unified-docker-812:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}

   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
   accelerators and includes the following components:

   .. list-table::
      :header-rows: 1

      * - Software component
        - Version

      * - `ROCm <https://github.com/ROCm/ROCm>`__
        - {{ unified_docker.rocm_version }}

      * - `vLLM <https://docs.vllm.ai/en/latest>`__
        - {{ unified_docker.vllm_version }}

      * - `PyTorch <https://github.com/ROCm/pytorch>`__
        - {{ unified_docker.pytorch_version }}

      * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
        - {{ unified_docker.hipblaslt_version }}

   With this Docker image, you can quickly test the :ref:`expected
   inference performance numbers <vllm-benchmark-performance-measurements-812>` for
   MI300X series accelerators.

What's new
==========

The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.

* Upgraded to vLLM v0.10.

* FP8 KV cache support via AITER.

* Full graph capture support via AITER.

Supported models
================

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml

   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}

   .. _vllm-benchmark-available-models-812:

   The following models are supported for inference performance benchmarking
   with vLLM and ROCm. Some instructions, commands, and recommendations in this
   documentation might vary by model -- select one to get started.

   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
          <div class="col-2 me-2 model-param-head">Model group</div>
          <div class="row col-10">
            {% for model_group in model_groups %}
            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
            {% endfor %}
          </div>
        </div>

        <div class="row mt-1">
          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
            {% for model_group in model_groups %}
            {% set models = model_group.models %}
            {% for model in models %}
            {% if models|length % 3 == 0 %}
            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% else %}
            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% endif %}
            {% endfor %}
            {% endfor %}
          </div>
        </div>
      </div>

   .. _vllm-benchmark-vllm-812:

   {% for model_group in model_groups %}
   {% for model in model_group.models %}

   .. container:: model-doc {{model.mad_tag}}

      .. note::

         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
         Some models require access authorization prior to use via an external license agreement through a third party.

   {% endfor %}
   {% endfor %}

.. note::

   vLLM is a toolkit and library for LLM inference and serving. AMD implements
   high-performance custom kernels and modules in vLLM to enhance performance.
   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
   more information.
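As an illustration only, and separate from the MAD workflow described below, the vLLM command-line interface inside the container can also serve a model directly; the model name and tensor-parallel size here are example values, not a recommended configuration:

.. code-block:: shell

   # Hypothetical example of serving a model with the vLLM CLI inside the container.
   vllm serve meta-llama/Llama-3.1-8B-Instruct --tensor-parallel-size 1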
.. _vllm-benchmark-performance-measurements-812:
|
||||||
|
|
||||||
|
Performance measurements
|
||||||
|
========================
|
||||||
|
|
||||||
|
To evaluate performance, the
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
|
page provides reference throughput and serving measurements for inferencing popular AI models.
|
||||||
|
|
||||||
|
.. important::
|
||||||
|
|
||||||
|
The performance data presented in
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
|
only reflects the latest version of this inference benchmarking environment.
|
||||||
|
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
||||||
|
|
||||||
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
|
correctly and performing optimally.
|
||||||
|
|
||||||
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before starting the benchmarks.
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
|
Pull the Docker image
|
||||||
|
=====================
|
||||||
|
|
||||||
|
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||||
|
Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
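
For reference, for this 0.10.0 (20250812) release the rendered command is expected to match the image tag listed in the version history; verify the tag on Docker Hub before pulling.

.. code-block:: shell

   # Tag taken from the "Previous versions" history table for this release
   docker pull rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812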
|
||||||
|
|
||||||
|
Benchmarking
|
||||||
|
============
|
||||||
|
|
||||||
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
|
benchmark results:
|
||||||
|
|
||||||
|
.. _vllm-benchmark-mad-812:
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{model.mad_tag}}
|
||||||
|
|
||||||
|
.. tab-set::
|
||||||
|
|
||||||
|
.. tab-item:: MAD-integrated benchmarking
|
||||||
|
|
||||||
|
1. Clone the ROCm Model Automation and Dashboarding (MAD) repository (`<https://github.com/ROCm/MAD>`__) to a local
|
||||||
|
directory and install the required packages on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
|
||||||
|
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||||
|
madengine run \
|
||||||
|
--tags {{model.mad_tag}} \
|
||||||
|
--keep-model-dir \
|
||||||
|
--live-output \
|
||||||
|
--timeout 28800
|
||||||
|
|
||||||
|
MAD launches a Docker container with the name
|
||||||
|
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||||
|
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||||
|
and ``{{ model.mad_tag }}_serving.csv``.
|
||||||
|
|
||||||
|
Although the :ref:`available models
|
||||||
|
<vllm-benchmark-available-models-812>` are preconfigured to collect
|
||||||
|
offline throughput and online serving performance data, you can
|
||||||
|
also change the benchmarking parameters. See the standalone
|
||||||
|
benchmarking tab for more information.
|
||||||
|
|
||||||
|
{% if model.tunableop %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
|
||||||
|
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||||
|
operators to find the fastest one for your hardware.
|
||||||
|
|
||||||
|
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
|
||||||
|
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
|
||||||
|
the ``--tunableop on`` argument in your run.
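
A sketch of the resulting invocation, reusing the flags from step 2 above:

.. code-block:: shell

   # Same command as step 2, with TunableOp enabled
   madengine run \
     --tags {{model.mad_tag}} \
     --keep-model-dir \
     --live-output \
     --timeout 28800 \
     --tunableop on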
|
||||||
|
|
||||||
|
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
|
||||||
|
performance-collection run.
|
||||||
|
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
.. tab-item:: Standalone benchmarking
|
||||||
|
|
||||||
|
.. rubric:: Download the Docker image and required scripts
|
||||||
|
|
||||||
|
1. Run the vLLM benchmark tool independently by starting the
|
||||||
|
`Docker container <{{ unified_docker.docker_hub_url }}>`_
|
||||||
|
as shown in the following snippet.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
docker run -it \
|
||||||
|
--device=/dev/kfd \
|
||||||
|
--device=/dev/dri \
|
||||||
|
--group-add video \
|
||||||
|
--shm-size 16G \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--security-opt apparmor=unconfined \
|
||||||
|
--cap-add=SYS_PTRACE \
|
||||||
|
-v $(pwd):/workspace \
|
||||||
|
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||||
|
--name test \
|
||||||
|
{{ unified_docker.pull_tag }}
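
Once the container is running, you can optionally confirm that the accelerators are visible before benchmarking. ``rocm-smi`` ships with the ROCm base image:

.. code-block:: shell

   # Inside the container: list the detected AMD GPUs and their status
   rocm-smi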
|
||||||
|
|
||||||
|
2. In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||||
|
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD/scripts/vllm
|
||||||
|
|
||||||
|
3. To start the benchmark, use the following command with the appropriate options.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./run.sh \
|
||||||
|
--config $CONFIG_CSV \
|
||||||
|
--model_repo {{ model.model_repo }} \
|
||||||
|
<overrides>
|
||||||
|
|
||||||
|
.. dropdown:: Benchmark options
|
||||||
|
:open:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
:align: center
|
||||||
|
|
||||||
|
* - Name
|
||||||
|
- Options
|
||||||
|
- Description
|
||||||
|
|
||||||
|
* - ``--config``
|
||||||
|
- ``configs/default.csv``
|
||||||
|
- Run configs from the CSV for the chosen model repo and benchmark.
|
||||||
|
|
||||||
|
* -
|
||||||
|
- ``configs/extended.csv``
|
||||||
|
-
|
||||||
|
|
||||||
|
* -
|
||||||
|
- ``configs/performance.csv``
|
||||||
|
-
|
||||||
|
|
||||||
|
* - ``--benchmark``
|
||||||
|
- ``throughput``
|
||||||
|
- Measure offline end-to-end throughput.
|
||||||
|
|
||||||
|
* -
|
||||||
|
- ``serving``
|
||||||
|
- Measure online serving performance.
|
||||||
|
|
||||||
|
* -
|
||||||
|
- ``all``
|
||||||
|
- Measure both throughput and serving.
|
||||||
|
|
||||||
|
* - ``<overrides>``
|
||||||
|
- See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
|
||||||
|
- Additional overrides to the config CSV.
|
||||||
|
|
||||||
|
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||||
|
already configured. You don't need to specify them with this script.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
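
For a standalone run, a minimal way to set this (assuming a Bash shell inside the container) is:

.. code-block:: shell

   export VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1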
|
||||||
|
|
||||||
|
If you encounter the following error, pass your access-authorized Hugging
|
||||||
|
Face token to the gated models.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
OSError: You are trying to access a gated repo.
|
||||||
|
|
||||||
|
# pass your HF_TOKEN
|
||||||
|
export HF_TOKEN=$your_personal_hf_token
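
Alternatively, you can provide the token when you launch the container rather than inside it. This uses Docker's standard ``--env`` flag; the sketch below omits the device, volume, and security flags shown in step 1 for brevity.

.. code-block:: shell

   # Simplified sketch -- add --env HF_TOKEN=... to the full docker run command from step 1
   docker run -it --env HF_TOKEN=$your_personal_hf_token {{ unified_docker.pull_tag }}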
|
||||||
|
|
||||||
|
.. rubric:: Benchmarking examples
|
||||||
|
|
||||||
|
Here are some examples of running the benchmark with various options:
|
||||||
|
|
||||||
|
* Throughput benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export MAD_MODEL_NAME={{ model.mad_tag }}
|
||||||
|
./run.sh \
|
||||||
|
--config configs/default.csv \
|
||||||
|
--model_repo {{model.model_repo}} \
|
||||||
|
--benchmark throughput
|
||||||
|
|
||||||
|
Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
|
||||||
|
|
||||||
|
* Serving benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
export MAD_MODEL_NAME={{ model.mad_tag }}
|
||||||
|
./run.sh \
|
||||||
|
--config configs/default.csv \
|
||||||
|
--model_repo {{model.model_repo}} \
|
||||||
|
--benchmark serving
|
||||||
|
|
||||||
|
Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<style>
|
||||||
|
mjx-container[jax="CHTML"][display="true"] {
|
||||||
|
text-align: left;
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Throughput is calculated as:
|
||||||
|
|
||||||
|
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||||
|
|
||||||
|
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
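
As a purely illustrative example (the numbers are assumptions, not measured results), 1024 requests with 128 input tokens and 128 output tokens each, completed in 60 seconds, would give:

- .. math:: throughput\_tot = 1024 \times (128 + 128) / 60 \approx 4369 \text{ tokens/s}

- .. math:: throughput\_gen = 1024 \times 128 / 60 \approx 2185 \text{ tokens/s}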
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
Advanced usage
|
||||||
|
==============
|
||||||
|
|
||||||
|
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||||
|
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
|
||||||
|
|
||||||
|
Reproducing the Docker image
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
||||||
|
|
||||||
|
1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/vllm.git
|
||||||
|
|
||||||
|
2. Checkout the specific release commit.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd vllm
|
||||||
|
git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
|
||||||
|
|
||||||
|
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
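
After the build completes, a quick sanity check is to start the image and print the installed vLLM version. This is a sketch that assumes ``python`` and the ``vllm`` package are on the default path in the built image.

.. code-block:: shell

   # Print the vLLM version baked into the locally built image
   docker run --rm vllm-rocm python -c "import vllm; print(vllm.__version__)"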
|
||||||
|
|
||||||
|
Further reading
|
||||||
|
===============
|
||||||
|
|
||||||
|
- To learn more about the options for latency and throughput benchmark scripts,
|
||||||
|
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||||
|
|
||||||
|
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||||
|
|
||||||
|
- To learn more about system settings and management practices to configure your system for
|
||||||
|
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||||
|
|
||||||
|
- For application performance optimization strategies for HPC and AI workloads,
|
||||||
|
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||||
|
|
||||||
|
- To learn how to run community models from Hugging Face on AMD GPUs, see
|
||||||
|
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
|
||||||
|
|
||||||
|
- To learn how to fine-tune LLMs and optimize inference, see
|
||||||
|
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
|
||||||
|
|
||||||
|
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||||
|
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||||
|
|
||||||
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`vllm-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/vllm`` Docker image.
|
||||||
@@ -14,7 +14,7 @@ vLLM inference performance testing
|
|||||||
This documentation does not reflect the latest version of ROCm vLLM
|
This documentation does not reflect the latest version of ROCm vLLM
|
||||||
inference performance documentation. See :doc:`../vllm` for the latest version.
|
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||||
|
|
||||||
.. _vllm-benchmark-unified-docker:
|
.. _vllm-benchmark-unified-docker-702:
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml
|
||||||
|
|
||||||
@@ -77,7 +77,7 @@ vLLM inference performance testing
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
.. _vllm-benchmark-vllm:
|
.. _vllm-benchmark-vllm-702:
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
@@ -159,7 +159,7 @@ vLLM inference performance testing
|
|||||||
Once the setup is complete, choose between two options to reproduce the
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
benchmark results:
|
benchmark results:
|
||||||
|
|
||||||
.. _vllm-benchmark-mad:
|
.. _vllm-benchmark-mad-702:
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
|
|||||||
@@ -0,0 +1,450 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
|
.. meta::
|
||||||
|
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
|
||||||
|
ROCm vLLM Docker image.
|
||||||
|
:keywords: model, MAD, automation, dashboarding, validate
|
||||||
|
|
||||||
|
**********************************
|
||||||
|
vLLM inference performance testing
|
||||||
|
**********************************
|
||||||
|
|
||||||
|
.. caution::
|
||||||
|
|
||||||
|
This documentation does not reflect the latest version of ROCm vLLM
|
||||||
|
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||||
|
|
||||||
|
.. _vllm-benchmark-unified-docker-715:
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
|
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
|
||||||
|
a prebuilt, optimized environment for validating large language model (LLM)
|
||||||
|
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
|
||||||
|
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
|
||||||
|
accelerators and includes the following components:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Software component
|
||||||
|
- Version
|
||||||
|
|
||||||
|
* - `ROCm <https://github.com/ROCm/ROCm>`__
|
||||||
|
- {{ unified_docker.rocm_version }}
|
||||||
|
|
||||||
|
* - `vLLM <https://docs.vllm.ai/en/latest>`__
|
||||||
|
- {{ unified_docker.vllm_version }}
|
||||||
|
|
||||||
|
* - `PyTorch <https://github.com/ROCm/pytorch>`__
|
||||||
|
- {{ unified_docker.pytorch_version }}
|
||||||
|
|
||||||
|
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
||||||
|
- {{ unified_docker.hipblaslt_version }}
|
||||||
|
|
||||||
|
With this Docker image, you can quickly test the :ref:`expected
|
||||||
|
inference performance numbers <vllm-benchmark-performance-measurements-715>` for
|
||||||
|
MI300X series accelerators.
|
||||||
|
|
||||||
|
What's new
|
||||||
|
==========
|
||||||
|
|
||||||
|
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
|
||||||
|
|
||||||
|
* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default.
|
||||||
|
This parameter has been removed from the benchmarking script.
|
||||||
|
|
||||||
|
* Resolved the Llama 3.1 405B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
|
||||||
|
This parameter has been removed from the benchmarking script.
|
||||||
|
|
||||||
|
* Fixed a ``+rms_norm`` custom kernel issue.
|
||||||
|
|
||||||
|
* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable it; supported modes are ``FP``, ``INT8``, ``INT6``, and ``INT4``. A usage sketch follows this list.
|
||||||
|
|
||||||
|
* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
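
The quick reduce feature mentioned above is controlled through an environment variable. A minimal sketch of enabling it before starting vLLM, using the ``FP`` mode named in the list:

.. code-block:: shell

   export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP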
|
||||||
|
|
||||||
|
Supported models
|
||||||
|
================
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
|
.. _vllm-benchmark-available-models-715:
|
||||||
|
|
||||||
|
The following models are supported for inference performance benchmarking
|
||||||
|
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||||
|
documentation might vary by model -- select one to get started.
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model group</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row mt-1">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
.. _vllm-benchmark-vllm-715:
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{model.mad_tag}}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||||
|
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
||||||
|
high-performance custom kernels and modules in vLLM to enhance performance.
|
||||||
|
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||||
|
more information.
|
||||||
|
|
||||||
|
.. _vllm-benchmark-performance-measurements-715:
|
||||||
|
|
||||||
|
Performance measurements
|
||||||
|
========================
|
||||||
|
|
||||||
|
To evaluate performance, the
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
|
page provides reference throughput and latency measurements for inferencing popular AI models.
|
||||||
|
|
||||||
|
.. important::
|
||||||
|
|
||||||
|
The performance data presented in
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
|
only reflects the latest version of this inference benchmarking environment.
|
||||||
|
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
||||||
|
|
||||||
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
|
correctly and performing optimally.
|
||||||
|
|
||||||
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before starting the benchmarks.
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
|
Pull the Docker image
|
||||||
|
=====================
|
||||||
|
|
||||||
|
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||||
|
Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
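
For reference, for this 0.9.1 (20250715) release the rendered command is expected to match the image tag listed in the version history; verify the tag on Docker Hub before pulling.

.. code-block:: shell

   # Tag taken from the "Previous versions" history table for this release
   docker pull rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715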
|
||||||
|
|
||||||
|
Benchmarking
|
||||||
|
============
|
||||||
|
|
||||||
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
|
benchmark results:
|
||||||
|
|
||||||
|
.. _vllm-benchmark-mad-715:
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{model.mad_tag}}
|
||||||
|
|
||||||
|
.. tab-set::
|
||||||
|
|
||||||
|
.. tab-item:: MAD-integrated benchmarking
|
||||||
|
|
||||||
|
1. Clone the ROCm Model Automation and Dashboarding (MAD) repository (`<https://github.com/ROCm/MAD>`__) to a local
|
||||||
|
directory and install the required packages on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
|
||||||
|
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||||
|
madengine run \
|
||||||
|
--tags {{model.mad_tag}} \
|
||||||
|
--keep-model-dir \
|
||||||
|
--live-output \
|
||||||
|
--timeout 28800
|
||||||
|
|
||||||
|
MAD launches a Docker container with the name
|
||||||
|
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
||||||
|
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
|
||||||
|
|
||||||
|
Although the :ref:`available models <vllm-benchmark-available-models-715>` are preconfigured
|
||||||
|
to collect latency and throughput performance data, you can also change the benchmarking
|
||||||
|
parameters. See the standalone benchmarking tab for more information.
|
||||||
|
|
||||||
|
{% if model.tunableop %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
|
||||||
|
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||||
|
operators to find the fastest one for your hardware.
|
||||||
|
|
||||||
|
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
|
||||||
|
(see
|
||||||
|
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
|
||||||
|
To enable it, include the ``--tunableop on`` argument in your
|
||||||
|
run.
|
||||||
|
|
||||||
|
Enabling TunableOp triggers a two-pass run -- a warm-up followed
|
||||||
|
by the performance-collection run.
|
||||||
|
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
.. tab-item:: Standalone benchmarking
|
||||||
|
|
||||||
|
.. rubric:: Download the Docker image and required scripts
|
||||||
|
|
||||||
|
1. Run the vLLM benchmark tool independently by starting the
|
||||||
|
`Docker container <{{ unified_docker.docker_hub_url }}>`_
|
||||||
|
as shown in the following snippet.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
docker run -it \
|
||||||
|
--device=/dev/kfd \
|
||||||
|
--device=/dev/dri \
|
||||||
|
--group-add video \
|
||||||
|
--shm-size 16G \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--security-opt apparmor=unconfined \
|
||||||
|
--cap-add=SYS_PTRACE \
|
||||||
|
-v $(pwd):/workspace \
|
||||||
|
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||||
|
--name test \
|
||||||
|
{{ unified_docker.pull_tag }}
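
Once the container is running, you can optionally confirm that the accelerators are visible before benchmarking. ``rocm-smi`` ships with the ROCm base image:

.. code-block:: shell

   # Inside the container: list the detected AMD GPUs and their status
   rocm-smi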
|
||||||
|
|
||||||
|
2. In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||||
|
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD/scripts/vllm
|
||||||
|
|
||||||
|
3. To start the benchmark, use the following command with the appropriate options.
|
||||||
|
|
||||||
|
.. dropdown:: Benchmark options
|
||||||
|
:open:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
:align: center
|
||||||
|
|
||||||
|
* - Name
|
||||||
|
- Options
|
||||||
|
- Description
|
||||||
|
|
||||||
|
* - ``$test_option``
|
||||||
|
- latency
|
||||||
|
- Measure decoding token latency
|
||||||
|
|
||||||
|
* -
|
||||||
|
- throughput
|
||||||
|
- Measure token generation throughput
|
||||||
|
|
||||||
|
* -
|
||||||
|
- all
|
||||||
|
- Measure both throughput and latency
|
||||||
|
|
||||||
|
* - ``$num_gpu``
|
||||||
|
- 1 or 8
|
||||||
|
- Number of GPUs
|
||||||
|
|
||||||
|
* - ``$datatype``
|
||||||
|
- ``float16`` or ``float8``
|
||||||
|
- Data type
|
||||||
|
|
||||||
|
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||||
|
already configured. You don't need to specify them with this script.
|
||||||
|
|
||||||
|
Command:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh \
|
||||||
|
-s $test_option \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g $num_gpu \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
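
For a standalone run, a minimal way to set this (assuming a Bash shell inside the container) is:

.. code-block:: shell

   export VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1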
|
||||||
|
|
||||||
|
If you encounter the following error, pass your access-authorized Hugging
|
||||||
|
Face token to the gated models.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
OSError: You are trying to access a gated repo.
|
||||||
|
|
||||||
|
# pass your HF_TOKEN
|
||||||
|
export HF_TOKEN=$your_personal_hf_token
|
||||||
|
|
||||||
|
.. rubric:: Benchmarking examples
|
||||||
|
|
||||||
|
Here are some examples of running the benchmark with various options:
|
||||||
|
|
||||||
|
* Latency benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh \
|
||||||
|
-s latency \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g 8 \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
|
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
|
||||||
|
|
||||||
|
* Throughput benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh \
|
||||||
|
-s throughput \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g 8 \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
|
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<style>
|
||||||
|
mjx-container[jax="CHTML"][display="true"] {
|
||||||
|
text-align: left;
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Throughput is calculated as:
|
||||||
|
|
||||||
|
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||||
|
|
||||||
|
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
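
As a purely illustrative example (the numbers are assumptions, not measured results), 1024 requests with 128 input tokens and 128 output tokens each, completed in 60 seconds, would give:

- .. math:: throughput\_tot = 1024 \times (128 + 128) / 60 \approx 4369 \text{ tokens/s}

- .. math:: throughput\_gen = 1024 \times 128 / 60 \approx 2185 \text{ tokens/s}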
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
Advanced usage
|
||||||
|
==============
|
||||||
|
|
||||||
|
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||||
|
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
|
||||||
|
|
||||||
|
Reproducing the Docker image
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
||||||
|
|
||||||
|
1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/vllm.git
|
||||||
|
|
||||||
|
2. Checkout the specific release commit.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd vllm
|
||||||
|
git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
|
||||||
|
|
||||||
|
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
|
||||||
|
|
||||||
|
Known issues and workarounds
|
||||||
|
============================
|
||||||
|
|
||||||
|
AITER does not support FP8 KV cache yet.
|
||||||
|
|
||||||
|
Further reading
|
||||||
|
===============
|
||||||
|
|
||||||
|
- To learn more about the options for latency and throughput benchmark scripts,
|
||||||
|
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||||
|
|
||||||
|
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||||
|
|
||||||
|
- To learn more about system settings and management practices to configure your system for
|
||||||
|
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||||
|
|
||||||
|
- For application performance optimization strategies for HPC and AI workloads,
|
||||||
|
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||||
|
|
||||||
|
- To learn how to run community models from Hugging Face on AMD GPUs, see
|
||||||
|
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
|
||||||
|
|
||||||
|
- To learn how to fine-tune LLMs and optimize inference, see
|
||||||
|
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
|
||||||
|
|
||||||
|
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||||
|
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||||
|
|
||||||
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`vllm-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/vllm`` Docker image.
|
||||||
@@ -7,7 +7,7 @@ vLLM inference performance testing version history
|
|||||||
This table lists previous versions of the ROCm vLLM inference Docker image for
|
This table lists previous versions of the ROCm vLLM inference Docker image for
|
||||||
inference performance testing. For detailed information about available models
|
inference performance testing. For detailed information about available models
|
||||||
for benchmarking, see the version-specific documentation. You can find tagged
|
for benchmarking, see the version-specific documentation. You can find tagged
|
||||||
previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`__.
|
previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c>`__.
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
@@ -16,14 +16,23 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
|
|||||||
- Components
|
- Components
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812``
|
||||||
(latest)
|
(latest)
|
||||||
|
-
|
||||||
|
* ROCm 6.4.1
|
||||||
|
* vLLM 0.10.0
|
||||||
|
* PyTorch 2.7.0
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <../vllm>`
|
||||||
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa>`__
|
||||||
|
|
||||||
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
|
||||||
-
|
-
|
||||||
* ROCm 6.4.1
|
* ROCm 6.4.1
|
||||||
* vLLM 0.9.1
|
* vLLM 0.9.1
|
||||||
* PyTorch 2.7.0
|
* PyTorch 2.7.0
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../vllm>`
|
* :doc:`Documentation <vllm-0.9.1-20250715>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__
|
||||||
|
|
||||||
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
|
||||||
|
|||||||
@@ -31,26 +31,30 @@ PyTorch inference performance testing
|
|||||||
.. raw:: html
|
.. raw:: html
|
||||||
|
|
||||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
<div class="row">
|
<div class="row gx-0">
|
||||||
<div class="col-2 me-2 model-param-head">Model</div>
|
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||||
<div class="row col-10">
|
<div class="row col-10 pe-0">
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
{% endfor %}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="row mt-1" style="display: none;">
|
|
||||||
<div class="col-2 me-2 model-param-head">Model</div>
|
|
||||||
<div class="row col-10">
|
|
||||||
{% for model_group in model_groups %}
|
|
||||||
{% set models = model_group.models %}
|
|
||||||
{% for model in models %}
|
|
||||||
<div class="col-12 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
|
||||||
|
<div class="row gx-0 pt-1" style="display: none;">
|
||||||
|
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||||
|
<div class="row col-10 pe-0">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
@@ -103,7 +107,7 @@ PyTorch inference performance testing
|
|||||||
|
|
||||||
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
|
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
|
||||||
|
|
||||||
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference pyt_janus_pro_inference
|
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference pyt_janus_pro_inference pyt_hy_video
|
||||||
|
|
||||||
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`__ from Docker Hub.
|
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`__ from Docker Hub.
|
||||||
|
|
||||||
|
|||||||
@@ -2,19 +2,19 @@
|
|||||||
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and SGLang
|
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and SGLang
|
||||||
:keywords: model, MAD, automation, dashboarding, validate
|
:keywords: model, MAD, automation, dashboarding, validate
|
||||||
|
|
||||||
************************************
|
*****************************************************************
|
||||||
SGLang inference performance testing
|
SGLang inference performance testing DeepSeek-R1-Distill-Qwen-32B
|
||||||
************************************
|
*****************************************************************
|
||||||
|
|
||||||
.. _sglang-benchmark-unified-docker:
|
.. _sglang-benchmark-unified-docker:
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
|
||||||
|
|
||||||
{% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
|
{% set docker = data.dockers[0] %}
|
||||||
|
|
||||||
`SGLang <https://docs.sglang.ai>`__ is a high-performance inference and
|
`SGLang <https://docs.sglang.ai>`__ is a high-performance inference and
|
||||||
serving engine for large language models (LLMs) and vision models. The
|
serving engine for large language models (LLMs) and vision models. The
|
||||||
ROCm-enabled `SGLang Docker image <{{ unified_docker.docker_hub_url }}>`__
|
ROCm-enabled `SGLang Docker image <{{ docker.docker_hub_url }}>`__
|
||||||
bundles SGLang with PyTorch, optimized for AMD Instinct MI300X series
|
bundles SGLang with PyTorch, optimized for AMD Instinct MI300X series
|
||||||
accelerators. It includes the following software components:
|
accelerators. It includes the following software components:
|
||||||
|
|
||||||
@@ -24,14 +24,10 @@ SGLang inference performance testing
|
|||||||
* - Software component
|
* - Software component
|
||||||
- Version
|
- Version
|
||||||
|
|
||||||
* - `ROCm <https://github.com/ROCm/ROCm>`__
|
{% for component_name, component_version in docker.components.items() %}
|
||||||
- {{ unified_docker.rocm_version }}
|
* - {{ component_name }}
|
||||||
|
- {{ component_version }}
|
||||||
* - `SGLang <https://docs.sglang.ai/index.html>`__
|
{% endfor %}
|
||||||
- {{ unified_docker.sglang_version }}
|
|
||||||
|
|
||||||
* - `PyTorch <https://github.com/pytorch/pytorch>`__
|
|
||||||
- {{ unified_docker.pytorch_version }}
|
|
||||||
|
|
||||||
System validation
|
System validation
|
||||||
=================
|
=================
|
||||||
@@ -50,8 +46,8 @@ system's configuration.
|
|||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
|
||||||
|
|
||||||
{% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
|
{% set unified_docker = data.dockers[0] %}
|
||||||
{% set model_groups = data.sglang_benchmark.model_groups %}
|
{% set model_groups = data.model_groups %}
|
||||||
|
|
||||||
Pull the Docker image
|
Pull the Docker image
|
||||||
=====================
|
=====================
|
||||||
|
|||||||
@@ -7,14 +7,13 @@
|
|||||||
vLLM inference performance testing
|
vLLM inference performance testing
|
||||||
**********************************
|
**********************************
|
||||||
|
|
||||||
.. _vllm-benchmark-unified-docker:
|
.. _vllm-benchmark-unified-docker-909:
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||||
|
|
||||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
{% set docker = data.dockers[0] %}
|
||||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
|
||||||
|
|
||||||
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
|
The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers
|
||||||
a prebuilt, optimized environment for validating large language model (LLM)
|
a prebuilt, optimized environment for validating large language model (LLM)
|
||||||
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
|
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
|
||||||
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
|
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
|
||||||
@@ -26,20 +25,13 @@ vLLM inference performance testing
|
|||||||
* - Software component
|
* - Software component
|
||||||
- Version
|
- Version
|
||||||
|
|
||||||
* - `ROCm <https://github.com/ROCm/ROCm>`__
|
{% for component_name, component_version in docker.components.items() %}
|
||||||
- {{ unified_docker.rocm_version }}
|
* - {{ component_name }}
|
||||||
|
- {{ component_version }}
|
||||||
* - `vLLM <https://docs.vllm.ai/en/latest>`__
|
{% endfor %}
|
||||||
- {{ unified_docker.vllm_version }}
|
|
||||||
|
|
||||||
* - `PyTorch <https://github.com/ROCm/pytorch>`__
|
|
||||||
- {{ unified_docker.pytorch_version }}
|
|
||||||
|
|
||||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
|
||||||
- {{ unified_docker.hipblaslt_version }}
|
|
||||||
|
|
||||||
With this Docker image, you can quickly test the :ref:`expected
|
With this Docker image, you can quickly test the :ref:`expected
|
||||||
inference performance numbers <vllm-benchmark-performance-measurements>` for
|
inference performance numbers <vllm-benchmark-performance-measurements-909>` for
|
||||||
MI300X series accelerators.
|
MI300X series accelerators.
|
||||||
|
|
||||||
What's new
|
What's new
|
||||||
@@ -47,27 +39,23 @@ What's new
|
|||||||
|
|
||||||
The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
||||||
|
|
||||||
* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default.
|
* Upgraded to vLLM v0.10.1.
|
||||||
This parameter has been removed from the benchmarking script.
|
|
||||||
|
|
||||||
* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
|
* Set ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` by default for better performance.
|
||||||
This parameter has been removed from the benchmarking script.
|
|
||||||
|
|
||||||
* Fixed a ``+rms_norm`` custom kernel issue.
|
* Set ``VLLM_ROCM_USE_AITER_RMSNORM=0`` by default to avoid various issues with torch compile.
|
||||||
|
|
||||||
* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``.
|
.. _vllm-benchmark-supported-models-909:
|
||||||
|
|
||||||
* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
|
|
||||||
|
|
||||||
Supported models
|
Supported models
|
||||||
================
|
================
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||||
|
|
||||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
{% set docker = data.dockers[0] %}
|
||||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
{% set model_groups = data.model_groups %}
|
||||||
|
|
||||||
.. _vllm-benchmark-available-models:
|
.. _vllm-benchmark-available-models-909:
|
||||||
|
|
||||||
The following models are supported for inference performance benchmarking
|
The following models are supported for inference performance benchmarking
|
||||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||||
@@ -76,62 +64,58 @@ Supported models
|
|||||||
.. raw:: html
|
.. raw:: html
|
||||||
|
|
||||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
<div class="row">
|
<div class="row gx-0">
|
||||||
<div class="col-2 me-2 model-param-head">Model group</div>
|
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||||
<div class="row col-10">
|
<div class="row col-10 pe-0">
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
{% endfor %}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="row mt-1">
|
|
||||||
<div class="col-2 me-2 model-param-head">Model</div>
|
|
||||||
<div class="row col-10">
|
|
||||||
{% for model_group in model_groups %}
|
|
||||||
{% set models = model_group.models %}
|
|
||||||
{% for model in models %}
|
|
||||||
{% if models|length % 3 == 0 %}
|
|
||||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
|
||||||
{% else %}
|
|
||||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
|
||||||
{% endif %}
|
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row gx-0 pt-1">
|
||||||
|
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||||
|
<div class="row col-10 pe-0">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
|
||||||
|
|
||||||
.. _vllm-benchmark-vllm:
|
.. _vllm-benchmark-vllm-909:
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
.. container:: model-doc {{model.mad_tag}}
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||||
Some models require access authorization prior to use via an external license agreement through a third party.
|
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||||
|
{% if model.precision == "float8" and model.model_repo.startswith("amd") %}
|
||||||
|
This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD accelerators.
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|
||||||
.. note::
|
.. _vllm-benchmark-performance-measurements-909:
|
||||||
|
|
||||||
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
|
||||||
high-performance custom kernels and modules in vLLM to enhance performance.
|
|
||||||
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
|
||||||
more information.
|
|
||||||
|
|
||||||
.. _vllm-benchmark-performance-measurements:
|
|
||||||
|
|
||||||
Performance measurements
|
Performance measurements
|
||||||
========================
|
========================
|
||||||
|
|
||||||
To evaluate performance, the
|
To evaluate performance, the
|
||||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
page provides reference throughput and latency measurements for inferencing popular AI models.
|
page provides reference throughput and serving measurements for inferencing popular AI models.
|
||||||
|
|
||||||
.. important::
|
.. important::
|
||||||
|
|
||||||
@@ -157,18 +141,18 @@ system's configuration.
|
|||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||||
|
|
||||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
{% set docker = data.dockers[0] %}
|
||||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
{% set model_groups = data.model_groups %}
|
||||||
|
|
||||||
Pull the Docker image
|
Pull the Docker image
|
||||||
=====================
|
=====================
|
||||||
|
|
||||||
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
|
||||||
Use the following command to pull the Docker image from Docker Hub.
|
Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
docker pull {{ unified_docker.pull_tag }}
|
docker pull {{ docker.pull_tag }}
|
||||||
|
|
||||||
Benchmarking
|
Benchmarking
|
||||||
============
|
============
|
||||||
@@ -176,7 +160,7 @@ system's configuration.
|
|||||||
Once the setup is complete, choose between two options to reproduce the
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
benchmark results:
|
benchmark results:
|
||||||
|
|
||||||
.. _vllm-benchmark-mad:
|
.. _vllm-benchmark-mad-909:
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
@@ -187,6 +171,9 @@ system's configuration.
|
|||||||
|
|
||||||
.. tab-item:: MAD-integrated benchmarking
|
.. tab-item:: MAD-integrated benchmarking
|
||||||
|
|
||||||
|
The following run command is tailored to {{ model.model }}.
|
||||||
|
See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
|
||||||
|
|
||||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||||
directory and install the required packages on the host machine.
|
directory and install the required packages on the host machine.
|
||||||
|
|
||||||
@@ -209,12 +196,15 @@ system's configuration.
|
|||||||
--timeout 28800
|
--timeout 28800
|
||||||
|
|
||||||
MAD launches a Docker container with the name
|
MAD launches a Docker container with the name
|
||||||
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||||
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
|
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||||
|
and ``{{ model.mad_tag }}_serving.csv``.
|
||||||
|
|
||||||
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
|
Although the :ref:`available models
|
||||||
to collect latency and throughput performance data, you can also change the benchmarking
|
<vllm-benchmark-available-models-909>` are preconfigured to collect
|
||||||
parameters. See the standalone benchmarking tab for more information.
|
offline throughput and online serving performance data, you can
|
||||||
|
also change the benchmarking parameters. See the standalone
|
||||||
|
benchmarking tab for more information.
|
||||||
|
|
||||||
{% if model.tunableop %}
|
{% if model.tunableop %}
|
||||||
|
|
||||||
@@ -224,140 +214,154 @@ system's configuration.
|
|||||||
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||||
operators to find the fastest one for your hardware.
|
operators to find the fastest one for your hardware.
|
||||||
|
|
||||||
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
|
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
|
||||||
(see
|
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
|
||||||
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
|
the ``--tunableop on`` argument in your run.
|
||||||
To enable it, include the ``--tunableop on`` argument in your
|
|
||||||
run.
|
|
||||||
|
|
||||||
Enabling TunableOp triggers a two-pass run -- a warm-up followed
|
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
|
||||||
by the performance-collection run.
|
performance-collection run.
|
||||||
|
|
||||||
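For illustration, here is a hedged sketch of the run command above with TunableOp enabled -- only the ``--tunableop on`` argument is added, and the remaining arguments are assumed to match the earlier ``madengine run`` invocation:

.. code-block:: shell

   madengine run \
       --tags {{model.mad_tag}} \
       --keep-model-dir \
       --live-output \
       --timeout 28800 \
       --tunableop on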
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
.. tab-item:: Standalone benchmarking
|
.. tab-item:: Standalone benchmarking
|
||||||
|
|
||||||
.. rubric:: Download the Docker image and required scripts
|
The following commands are optimized for {{ model.model }}.
|
||||||
|
See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
|
||||||
|
|
||||||
1. Run the vLLM benchmark tool independently by starting the
|
.. seealso::
|
||||||
`Docker container <{{ unified_docker.docker_hub_url }}>`_
|
|
||||||
as shown in the following snippet.
|
For more information on configuration, see the `config files
|
||||||
|
<https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
|
||||||
|
in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
|
||||||
|
for descriptions of available configuration options
|
||||||
|
and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
|
||||||
|
additional benchmarking information.
|
||||||
|
|
||||||
|
.. rubric:: Launch the container
|
||||||
|
|
||||||
|
You can run the vLLM benchmark tool independently by starting the
|
||||||
|
`Docker container <{{ docker.docker_hub_url }}>`_ as shown
|
||||||
|
in the following snippet.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ docker.pull_tag }}
|
||||||
|
docker run -it \
|
||||||
|
--device=/dev/kfd \
|
||||||
|
--device=/dev/dri \
|
||||||
|
--group-add video \
|
||||||
|
--shm-size 16G \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--security-opt apparmor=unconfined \
|
||||||
|
--cap-add=SYS_PTRACE \
|
||||||
|
-v $(pwd):/workspace \
|
||||||
|
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||||
|
--name test \
|
||||||
|
{{ docker.pull_tag }}
|
||||||
|
|
||||||
|
.. rubric:: Throughput command
|
||||||
|
|
||||||
|
Use the following command to start the throughput benchmark.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
model={{ model.model_repo }}
|
||||||
|
tp={{ model.config.tp }}
|
||||||
|
num_prompts=1024
|
||||||
|
in=128
|
||||||
|
out=128
|
||||||
|
dtype={{ model.config.dtype }}
|
||||||
|
kv_cache_dtype={{ model.config.kv_cache_dtype }}
|
||||||
|
max_num_seqs=1024
|
||||||
|
max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
|
||||||
|
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
|
||||||
|
max_model_len={{ model.config.max_model_len }}
|
||||||
|
|
||||||
|
vllm bench throughput --model $model \
|
||||||
|
-tp $tp \
|
||||||
|
--num-prompts $num_prompts \
|
||||||
|
--input-len $in \
|
||||||
|
--output-len $out \
|
||||||
|
--dtype $dtype \
|
||||||
|
--kv-cache-dtype $kv_cache_dtype \
|
||||||
|
--max-num-seqs $max_num_seqs \
|
||||||
|
--max-seq-len-to-capture $max_seq_len_to_capture \
|
||||||
|
--max-num-batched-tokens $max_num_batched_tokens \
|
||||||
|
--max-model-len $max_model_len \
|
||||||
|
--trust-remote-code \
|
||||||
|
--output-json ${model}_throughput.json \
|
||||||
|
--gpu-memory-utilization 0.9
|
||||||
|
|
||||||
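The throughput results are written to ``${model}_throughput.json``. To take a quick look at the raw results, the file can be pretty-printed with Python's standard tooling (illustrative only; no additional dependencies are assumed):

.. code-block:: shell

   python3 -m json.tool ${model}_throughput.json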
|
.. rubric:: Serving command
|
||||||
|
|
||||||
|
1. Start the server using the following command:
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
docker pull {{ unified_docker.pull_tag }}
|
model={{ model.model_repo }}
|
||||||
docker run -it \
|
tp={{ model.config.tp }}
|
||||||
--device=/dev/kfd \
|
dtype={{ model.config.dtype }}
|
||||||
--device=/dev/dri \
|
kv_cache_dtype={{ model.config.kv_cache_dtype }}
|
||||||
--group-add video \
|
max_num_seqs=256
|
||||||
--shm-size 16G \
|
max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
|
||||||
--security-opt seccomp=unconfined \
|
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
|
||||||
--security-opt apparmor=unconfined \
|
max_model_len={{ model.config.max_model_len }}
|
||||||
--cap-add=SYS_PTRACE \
|
|
||||||
-v $(pwd):/workspace \
|
|
||||||
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
|
||||||
--name test \
|
|
||||||
{{ unified_docker.pull_tag }}
|
|
||||||
|
|
||||||
2. In the Docker container, clone the ROCm MAD repository and navigate to the
|
vllm serve $model \
|
||||||
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
-tp $tp \
|
||||||
|
--dtype $dtype \
|
||||||
|
--kv-cache-dtype $kv_cache_dtype \
|
||||||
|
--max-num-seqs $max_num_seqs \
|
||||||
|
--max-seq-len-to-capture $max_seq_len_to_capture \
|
||||||
|
--max-num-batched-tokens $max_num_batched_tokens \
|
||||||
|
--max-model-len $max_model_len \
|
||||||
|
--no-enable-prefix-caching \
|
||||||
|
--swap-space 16 \
|
||||||
|
--disable-log-requests \
|
||||||
|
--trust-remote-code \
|
||||||
|
--gpu-memory-utilization 0.9
|
||||||
|
|
||||||
|
Wait until the model has loaded and the server is ready to accept requests.
|
||||||
|
|
||||||
|
2. In another terminal on the same machine, run the benchmark:
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
git clone https://github.com/ROCm/MAD
|
# Connect to the container
|
||||||
cd MAD/scripts/vllm
|
docker exec -it test bash
|
||||||
|
|
||||||
3. To start the benchmark, use the following command with the appropriate options.
|
# Wait for the server to start
|
||||||
|
until curl -s http://localhost:8000/v1/models; do sleep 30; done
|
||||||
|
|
||||||
.. dropdown:: Benchmark options
|
# Run the benchmark
|
||||||
:open:
|
model={{ model.model_repo }}
|
||||||
|
max_concurrency=1
|
||||||
|
num_prompts=10
|
||||||
|
in=128
|
||||||
|
out=128
|
||||||
|
vllm bench serve --model $model \
|
||||||
|
--percentile-metrics "ttft,tpot,itl,e2el" \
|
||||||
|
--dataset-name random \
|
||||||
|
--ignore-eos \
|
||||||
|
--max-concurrency $max_concurrency \
|
||||||
|
--num-prompts $num_prompts \
|
||||||
|
--random-input-len $in \
|
||||||
|
--random-output-len $out \
|
||||||
|
--trust-remote-code \
|
||||||
|
--save-result \
|
||||||
|
--result-filename ${model}_serving.json
|
||||||
|
|
||||||
.. list-table::
|
.. note::
|
||||||
:header-rows: 1
|
|
||||||
:align: center
|
|
||||||
|
|
||||||
* - Name
|
If you encounter the following error, provide a Hugging
|
||||||
- Options
|
Face token that has access to the gated models.
|
||||||
- Description
|
|
||||||
|
|
||||||
* - ``$test_option``
|
|
||||||
- latency
|
|
||||||
- Measure decoding token latency
|
|
||||||
|
|
||||||
* -
|
|
||||||
- throughput
|
|
||||||
- Measure token generation throughput
|
|
||||||
|
|
||||||
* -
|
|
||||||
- all
|
|
||||||
- Measure both throughput and latency
|
|
||||||
|
|
||||||
* - ``$num_gpu``
|
|
||||||
- 1 or 8
|
|
||||||
- Number of GPUs
|
|
||||||
|
|
||||||
* - ``$datatype``
|
|
||||||
- ``float16`` or ``float8``
|
|
||||||
- Data type
|
|
||||||
|
|
||||||
The input sequence length, output sequence length, and tensor parallel (TP) are
|
|
||||||
already configured. You don't need to specify them with this script.
|
|
||||||
|
|
||||||
Command:
|
|
||||||
|
|
||||||
.. code-block::
|
.. code-block::
|
||||||
|
|
||||||
./vllm_benchmark_report.sh \
|
OSError: You are trying to access a gated repo.
|
||||||
-s $test_option \
|
|
||||||
-m {{model.model_repo}} \
|
|
||||||
-g $num_gpu \
|
|
||||||
-d {{model.precision}}
|
|
||||||
|
|
||||||
.. note::
|
# pass your HF_TOKEN
|
||||||
|
export HF_TOKEN=$your_personal_hf_token
|
||||||
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
|
|
||||||
|
|
||||||
If you encounter the following error, pass your access-authorized Hugging
|
|
||||||
Face token to the gated models.
|
|
||||||
|
|
||||||
.. code-block::
|
|
||||||
|
|
||||||
OSError: You are trying to access a gated repo.
|
|
||||||
|
|
||||||
# pass your HF_TOKEN
|
|
||||||
export HF_TOKEN=$your_personal_hf_token
|
|
||||||
|
|
||||||
.. rubric:: Benchmarking examples
|
|
||||||
|
|
||||||
Here are some examples of running the benchmark with various options:
|
|
||||||
|
|
||||||
* Latency benchmark
|
|
||||||
|
|
||||||
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
|
||||||
|
|
||||||
.. code-block::
|
|
||||||
|
|
||||||
./vllm_benchmark_report.sh \
|
|
||||||
-s latency \
|
|
||||||
-m {{model.model_repo}} \
|
|
||||||
-g 8 \
|
|
||||||
-d {{model.precision}}
|
|
||||||
|
|
||||||
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
|
|
||||||
|
|
||||||
* Throughput benchmark
|
|
||||||
|
|
||||||
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
./vllm_benchmark_report.sh \
|
|
||||||
-s throughput \
|
|
||||||
-m {{model.model_repo}} \
|
|
||||||
-g 8 \
|
|
||||||
-d {{model.precision}}
|
|
||||||
|
|
||||||
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
|
|
||||||
|
|
||||||
.. raw:: html
|
.. raw:: html
|
||||||
|
|
||||||
@@ -382,7 +386,7 @@ Advanced usage
|
|||||||
==============
|
==============
|
||||||
|
|
||||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||||
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
|
see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
|
||||||
|
|
||||||
Reproducing the Docker image
|
Reproducing the Docker image
|
||||||
----------------------------
|
----------------------------
|
||||||
@@ -400,7 +404,7 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
|||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
cd vllm
|
cd vllm
|
||||||
git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
|
git checkout 6663000a391911eba96d7864a26ac42b07f6ef29
|
||||||
|
|
||||||
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
||||||
|
|
||||||
@@ -408,11 +412,6 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
|||||||
|
|
||||||
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
|
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
|
||||||
|
|
||||||
Known issues and workarounds
|
|
||||||
============================
|
|
||||||
|
|
||||||
AITER does not support FP8 KV cache yet.
|
|
||||||
|
|
||||||
Further reading
|
Further reading
|
||||||
===============
|
===============
|
||||||
|
|
||||||
@@ -424,15 +423,12 @@ Further reading
|
|||||||
- To learn more about system settings and management practices to configure your system for
|
- To learn more about system settings and management practices to configure your system for
|
||||||
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||||
|
|
||||||
|
- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||||
|
a brief introduction to vLLM and optimization strategies.
|
||||||
|
|
||||||
- For application performance optimization strategies for HPC and AI workloads,
|
- For application performance optimization strategies for HPC and AI workloads,
|
||||||
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||||
|
|
||||||
- To learn how to run community models from Hugging Face on AMD GPUs, see
|
|
||||||
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
|
|
||||||
|
|
||||||
- To learn how to fine-tune LLMs and optimize inference, see
|
|
||||||
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
|
|
||||||
|
|
||||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||||
|
|
||||||
|
|||||||
@@ -1,14 +1,14 @@
|
|||||||
.. meta::
|
.. meta::
|
||||||
:description: How to install ROCm and popular machine learning frameworks.
|
:description: How to install ROCm and popular deep learning frameworks.
|
||||||
:keywords: ROCm, AI, LLM, train, fine-tune, FSDP, DeepSpeed, LLaMA, tutorial
|
:keywords: ROCm, AI, LLM, train, fine-tune, FSDP, DeepSpeed, LLaMA, tutorial
|
||||||
|
|
||||||
.. _rocm-for-ai-install:
|
.. _rocm-for-ai-install:
|
||||||
|
|
||||||
***********************************************
|
********************************************
|
||||||
Installing ROCm and machine learning frameworks
|
Installing ROCm and deep learning frameworks
|
||||||
***********************************************
|
********************************************
|
||||||
|
|
||||||
Before getting started, install ROCm and supported machine learning frameworks.
|
Before getting started, install ROCm and supported deep learning frameworks.
|
||||||
|
|
||||||
.. grid:: 1
|
.. grid:: 1
|
||||||
|
|
||||||
@@ -22,9 +22,9 @@ If you’re new to ROCm, refer to the :doc:`ROCm quick start install guide for L
|
|||||||
<rocm-install-on-linux:install/quick-start>`.
|
<rocm-install-on-linux:install/quick-start>`.
|
||||||
|
|
||||||
If you’re using a Radeon GPU for graphics-accelerated applications, refer to the
|
If you’re using a Radeon GPU for graphics-accelerated applications, refer to the
|
||||||
`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/docs-6.1.3/docs/install/native_linux/install-radeon.html>`_.
|
`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/howto_native_linux.html>`_.
|
||||||
|
|
||||||
You can install ROCm on :ref:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
|
You can install ROCm on :doc:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
|
||||||
distribution's package manager. See the following documentation resources to get started:
|
distribution's package manager. See the following documentation resources to get started:
|
||||||
|
|
||||||
* :doc:`ROCm installation overview <rocm-install-on-linux:install/install-overview>`
|
* :doc:`ROCm installation overview <rocm-install-on-linux:install/install-overview>`
|
||||||
@@ -43,29 +43,16 @@ distribution's package manager. See the following documentation resources to get
|
|||||||
If you encounter any issues during installation, refer to the
|
If you encounter any issues during installation, refer to the
|
||||||
:doc:`Installation troubleshooting <rocm-install-on-linux:reference/install-faq>` guide.
|
:doc:`Installation troubleshooting <rocm-install-on-linux:reference/install-faq>` guide.
|
||||||
|
|
||||||
Machine learning frameworks
|
Deep learning frameworks
|
||||||
===========================
|
========================
|
||||||
|
|
||||||
ROCm supports popular machine learning frameworks and libraries including `PyTorch
|
ROCm supports deep learning frameworks and libraries including `PyTorch
|
||||||
<https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package>`_, `TensorFlow
|
<https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package>`_, `TensorFlow
|
||||||
<https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and `DeepSpeed
|
<https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and more.
|
||||||
<https://cloudblogs.microsoft.com/opensource/2022/03/21/supporting-efficient-large-model-training-on-amd-instinct-gpus-with-deepspeed/>`_.
|
|
||||||
|
|
||||||
Review the framework installation documentation. For ease of use, it's recommended to use official ROCm prebuilt Docker
|
Review the :doc:`framework installation documentation <../deep-learning-rocm>`. For ease of use, it's recommended to use official ROCm prebuilt Docker
|
||||||
images with the framework pre-installed.
|
images with the framework pre-installed.
|
||||||
|
|
||||||
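For example, a prebuilt ROCm PyTorch image can be pulled directly from Docker Hub. The ``latest`` tag below is only illustrative -- choose the tag that matches your ROCm and framework versions:

.. code-block:: shell

   docker pull rocm/pytorch:latest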
* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
|
||||||
|
|
||||||
* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
|
||||||
|
|
||||||
* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
|
||||||
|
|
||||||
* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
|
|
||||||
|
|
||||||
* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
|
||||||
|
|
||||||
* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
|
||||||
|
|
||||||
Next steps
|
Next steps
|
||||||
==========
|
==========
|
||||||
|
|
||||||
|
|||||||
@@ -2,9 +2,9 @@
|
|||||||
:description: How to train a model using JAX MaxText for ROCm.
|
:description: How to train a model using JAX MaxText for ROCm.
|
||||||
:keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
|
:keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
|
||||||
|
|
||||||
**************************************
|
******************************************
|
||||||
Training a model with MaxText for ROCm
|
Training a model with JAX MaxText for ROCm
|
||||||
**************************************
|
******************************************
|
||||||
|
|
||||||
MaxText is a high-performance, open-source framework built on the Google JAX
|
MaxText is a high-performance, open-source framework built on the Google JAX
|
||||||
machine learning library to train LLMs at scale. The MaxText framework for
|
machine learning library to train LLMs at scale. The MaxText framework for
|
||||||
@@ -12,70 +12,108 @@ ROCm is an optimized fork of the upstream
|
|||||||
`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
|
`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
|
||||||
on AMD MI300X series accelerators.
|
on AMD MI300X series accelerators.
|
||||||
|
|
||||||
The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.5``) image
|
The MaxText for ROCm training Docker image
|
||||||
provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
|
provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
|
||||||
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
|
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
|
||||||
It includes the following software components:
|
It includes the following software components:
|
||||||
|
|
||||||
+--------------------------+--------------------------------+
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||||
| Software component | Version |
|
|
||||||
+==========================+================================+
|
|
||||||
| ROCm | 6.3.4 |
|
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
| JAX | 0.4.35 |
|
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
| Python | 3.10.12 |
|
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
| Transformer Engine | 1.12.0.dev0+b8b92dc |
|
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
| hipBLASLt | 0.13.0-ae9c477a |
|
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
|
|
||||||
Supported features and models
|
{% set dockers = data.dockers %}
|
||||||
=============================
|
.. tab-set::
|
||||||
|
|
||||||
MaxText provides the following key features to train large language models efficiently:
|
{% for docker in dockers %}
|
||||||
|
{% set jax_version = docker.components["JAX"] %}
|
||||||
|
|
||||||
|
.. tab-item:: JAX {{ jax_version }}
|
||||||
|
:sync: {{ docker.pull_tag }}
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Software component
|
||||||
|
- Version
|
||||||
|
|
||||||
|
{% for component_name, component_version in docker.components.items() %}
|
||||||
|
* - {{ component_name }}
|
||||||
|
- {{ component_version }}
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% if jax_version == "0.6.0" %}
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Shardy is a new config in JAX 0.6.0. You might get related errors if it's
|
||||||
|
not configured correctly. For now, you can turn it off by setting
|
||||||
|
``shardy=False`` during the training run. You can also follow the `migration
|
||||||
|
guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
|
||||||
|
it.
|
||||||
|
|
||||||
|
The provided multi-node training scripts in this documentation are
|
||||||
|
not currently supported with JAX 0.6.0. For multi-node training, use the JAX 0.5.0
|
||||||
|
Docker image.
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
MaxText on ROCm provides the following key features to train large language models efficiently:
|
||||||
|
|
||||||
- Transformer Engine (TE)
|
- Transformer Engine (TE)
|
||||||
|
|
||||||
- Flash Attention (FA) 3
|
- Flash Attention (FA) 3 -- with or without sequence input packing
|
||||||
|
|
||||||
- GEMM tuning
|
- GEMM tuning
|
||||||
|
|
||||||
- Multi-node support
|
- Multi-node support
|
||||||
|
|
||||||
.. _amd-maxtext-model-support:
|
- NANOO FP8 quantization support
|
||||||
|
|
||||||
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
.. _amd-maxtext-model-support-v257:
|
||||||
|
|
||||||
* Llama 3.3 70B
|
Supported models
|
||||||
|
================
|
||||||
|
|
||||||
* Llama 3.1 8B
|
The following models are pre-optimized for performance on AMD Instinct MI300
|
||||||
|
series accelerators. Some instructions, commands, and available training
|
||||||
|
configurations in this documentation might vary by model -- select one to get
|
||||||
|
started.
|
||||||
|
|
||||||
* Llama 3.1 70B
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||||
|
|
||||||
* Llama 3 8B
|
{% set model_groups = data.model_groups %}
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
* Llama 3 70B
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
|
<div class="row gx-0">
|
||||||
|
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||||
|
<div class="row col-10 pe-0">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
* Llama 2 7B
|
<div class="row gx-0 pt-1">
|
||||||
|
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||||
* Llama 2 70B
|
<div class="row col-10 pe-0">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
* DeepSeek-V2-Lite
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
Some models, such as Llama 3, require an external license agreement through
|
Some models, such as Llama 3, require an external license agreement through
|
||||||
a third party (for example, Meta).
|
a third party (for example, Meta).
|
||||||
|
|
||||||
Unsupported features
|
|
||||||
--------------------
|
|
||||||
|
|
||||||
Currently, MaxText's default packed input format is not supported. Using this format
|
|
||||||
with the current Docker image results in incorrect attention calculations
|
|
||||||
across different input sequences. Support for packed input format is planned for a future release.
|
|
||||||
|
|
||||||
System validation
|
System validation
|
||||||
=================
|
=================
|
||||||
|
|
||||||
@@ -98,14 +136,14 @@ This Docker image is optimized for specific model configurations outlined
|
|||||||
as follows. Performance can vary for other training workloads, as AMD
|
as follows. Performance can vary for other training workloads, as AMD
|
||||||
doesn’t validate configurations and run conditions outside those described.
|
doesn’t validate configurations and run conditions outside those described.
|
||||||
|
|
||||||
.. _amd-maxtext-multi-node-setup:
|
.. _amd-maxtext-multi-node-setup-v257:
|
||||||
|
|
||||||
Multi-node setup
|
Multi-node setup
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
For multi-node environments, ensure you have all the necessary packages for
|
For multi-node environments, ensure you have all the necessary packages for
|
||||||
your network device, such as RDMA. If you're not using a multi-node setup
|
your network device, such as RDMA. If you're not using a multi-node setup
|
||||||
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
|
with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`.
|
||||||
|
|
||||||
1. Install the following packages to build and install the RDMA driver.
|
1. Install the following packages to build and install the RDMA driver.
|
||||||
|
|
||||||
@@ -170,7 +208,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
|
|||||||
|
|
||||||
e. RDMA interface
|
e. RDMA interface
|
||||||
|
|
||||||
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
|
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v257>` are installed on all nodes.
|
||||||
Then, set the RDMA interfaces to use for communication.
|
Then, set the RDMA interfaces to use for communication.
|
||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
@@ -180,196 +218,203 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
|
|||||||
# If using Mellanox NIC
|
# If using Mellanox NIC
|
||||||
export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
|
export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
|
||||||
|
|
||||||
.. _amd-maxtext-download-docker:
|
.. _amd-maxtext-get-started-v257:
|
||||||
|
|
||||||
Pull the Docker image
|
Benchmarking
|
||||||
---------------------
|
============
|
||||||
|
|
||||||
1. Use the following command to pull the Docker image from Docker Hub.
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
|
benchmark results:
|
||||||
|
|
||||||
.. code-block:: shell
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||||
|
|
||||||
docker pull rocm/jax-training:maxtext-v25.5
|
.. _vllm-benchmark-mad:
|
||||||
|
|
||||||
2. Use the following command to launch the Docker container. Note that the benchmarking scripts
|
{% set dockers = data.dockers %}
|
||||||
used in the :ref:`following section <amd-maxtext-get-started>` automatically launch the Docker container
|
{% set model_groups = data.model_groups %}
|
||||||
and execute the benchmark.
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
.. code-block:: shell
|
.. container:: model-doc {{model.mad_tag}}
|
||||||
|
|
||||||
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5
|
.. tab-set::
|
||||||
|
|
||||||
.. _amd-maxtext-get-started:
|
{% if model.mad_tag and "single-node" in model.doc_options %}
|
||||||
|
.. tab-item:: MAD-integrated benchmarking
|
||||||
|
|
||||||
Getting started
|
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||||
|
directory and install the required packages on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
2. Use this command to run the performance benchmark test on the {{ model.model }} model
|
||||||
|
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||||
|
madengine run \
|
||||||
|
--tags {{model.mad_tag}} \
|
||||||
|
--keep-model-dir \
|
||||||
|
--live-output \
|
||||||
|
--timeout 28800
|
||||||
|
|
||||||
|
MAD launches a Docker container with the name
|
||||||
|
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
||||||
|
model are collected in ``~/MAD/perf.csv``.
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
.. tab-item:: Standalone benchmarking
|
||||||
|
|
||||||
|
.. rubric:: Download the Docker image and required scripts
|
||||||
|
|
||||||
|
Run the JAX MaxText benchmark tool independently by starting the
|
||||||
|
Docker container as shown in the following snippet.
|
||||||
|
|
||||||
|
.. tab-set::
|
||||||
|
{% for docker in dockers %}
|
||||||
|
{% set jax_version = docker.components["JAX"] %}
|
||||||
|
|
||||||
|
.. tab-item:: JAX {{ jax_version }}
|
||||||
|
:sync: {{ docker.pull_tag }}
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ docker.pull_tag }}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
{% if model.model_repo and "single-node" in model.doc_options %}
|
||||||
|
.. rubric:: Single node training
|
||||||
|
|
||||||
|
1. Set up environment variables.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export MAD_SECRETS_HFTOKEN=<Your Hugging Face token>
|
||||||
|
export HF_HOME=<Location of saved/cached Hugging Face models>
|
||||||
|
|
||||||
|
``MAD_SECRETS_HFTOKEN`` is your Hugging Face access token, used to access models, tokenizers, and data.
|
||||||
|
See `User access tokens <https://huggingface.co/docs/hub/en/security-tokens>`__.
|
||||||
|
|
||||||
|
``HF_HOME`` is where ``huggingface_hub`` will store local data. See `huggingface_hub CLI <https://huggingface.co/docs/huggingface_hub/main/en/guides/cli#huggingface-cli-download>`__.
|
||||||
|
If you already have downloaded or cached Hugging Face artifacts, set this variable to that path.
|
||||||
|
Downloaded files typically get cached to ``~/.cache/huggingface``.
|
||||||
|
|
||||||
|
2. Launch the Docker container.
|
||||||
|
|
||||||
|
.. tab-set::
|
||||||
|
{% for docker in dockers %}
|
||||||
|
{% set jax_version = docker.components["JAX"] %}
|
||||||
|
|
||||||
|
.. tab-item:: JAX {{ jax_version }}
|
||||||
|
:sync: {{ docker.pull_tag }}
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker run -it \
|
||||||
|
--device=/dev/dri \
|
||||||
|
--device=/dev/kfd \
|
||||||
|
--network host \
|
||||||
|
--ipc host \
|
||||||
|
--group-add video \
|
||||||
|
--cap-add=SYS_PTRACE \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--privileged \
|
||||||
|
-v $HOME:$HOME \
|
||||||
|
-v $HOME/.ssh:/root/.ssh \
|
||||||
|
-v $HF_HOME:/hf_cache \
|
||||||
|
-e HF_HOME=/hf_cache \
|
||||||
|
-e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN \
|
||||||
|
--shm-size 64G \
|
||||||
|
--name training_env \
|
||||||
|
{{ docker.pull_tag }}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
3. In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||||
|
benchmark scripts directory at ``MAD/scripts/jax-maxtext``.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD/scripts/jax-maxtext
|
||||||
|
|
||||||
|
4. Run the setup scripts to install libraries and datasets needed
|
||||||
|
for benchmarking.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./jax-maxtext_benchmark_setup.sh -m {{ model.model_repo }}
|
||||||
|
|
||||||
|
5. To run the training benchmark without quantization, use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }}
|
||||||
|
|
||||||
|
For quantized training, use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8
|
||||||
|
|
||||||
|
.. important::
|
||||||
|
|
||||||
|
Quantized training is not supported with the JAX 0.6.0 Docker image; support
|
||||||
|
will be added in a future release. For quantized training, use the JAX 0.5.0
|
||||||
|
Docker image: ``rocm/jax-training:maxtext-v25.7``.
|
||||||
|
|
||||||
|
{% endif %}
|
||||||
|
{% if model.multinode_training_script and "multi-node" in model.doc_options %}
|
||||||
|
.. rubric:: Multi-node training
|
||||||
|
|
||||||
|
The following examples use SLURM to run on multiple nodes.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
The following scripts will launch the Docker container and run the
|
||||||
|
benchmark. Run them outside of any Docker container.
|
||||||
|
|
||||||
|
1. Make sure ``$HF_HOME`` is set before running the test. See
|
||||||
|
`ROCm benchmarking <https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__
|
||||||
|
for more details on downloading the Llama models before running the
|
||||||
|
benchmark.
|
||||||
|
|
||||||
|
2. To run multi-node training for {{ model.model }},
|
||||||
|
use the
|
||||||
|
`multi-node training script <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/{{ model.multinode_training_script }}>`__
|
||||||
|
under the ``scripts/jax-maxtext/gpu-rocm/`` directory.
|
||||||
|
|
||||||
|
3. Run the multi-node training benchmark script.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
sbatch -N <num_nodes> {{ model.multinode_training_script }}
|
||||||
|
|
||||||
|
{% else %}
|
||||||
|
.. rubric:: Multi-node training
|
||||||
|
|
||||||
|
For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v257`
|
||||||
|
with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
Further reading
|
||||||
===============
|
===============
|
||||||
|
|
||||||
The following examples demonstrate how to get started with single node
|
- See the ROCm/maxtext benchmarking README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__.
|
||||||
and multi-node training using the benchmarking scripts provided at
|
|
||||||
`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
|
|
||||||
|
|
||||||
.. important::
|
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||||
|
|
||||||
The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
|
- To learn more about system settings and management practices to configure your system for
|
||||||
|
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||||
|
|
||||||
Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
|
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||||
set correctly and points to your Hugging Face cache directory. Refer to the
|
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||||
README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
|
|
||||||
for more detailed instructions.
|
|
||||||
|
|
||||||
Single node training benchmarking examples
|
|
||||||
------------------------------------------
|
|
||||||
|
|
||||||
* Example 1: Single node training with Llama 2 7B
|
|
||||||
|
|
||||||
Download the benchmarking script:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
|
|
||||||
|
|
||||||
Run the single node training benchmark:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh
|
|
||||||
|
|
||||||
* Example 2: Single node training with Llama 2 70B
|
|
||||||
|
|
||||||
Download the benchmarking script:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
|
|
||||||
|
|
||||||
Run the single node training benchmark:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh
|
|
||||||
|
|
||||||
* Example 3: Single node training with Llama 3 8B
|
|
||||||
|
|
||||||
Download the benchmarking script:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
|
|
||||||
|
|
||||||
Run the single node training benchmark:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh
|
|
||||||
|
|
||||||
* Example 4: Single node training with Llama 3 70B
|
|
||||||
|
|
||||||
Download the benchmarking script:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
|
|
||||||
|
|
||||||
Run the single node training benchmark:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh
|
|
||||||
|
|
||||||
* Example 5: Single node training with Llama 3.3 70B
|
|
||||||
|
|
||||||
Download the benchmarking script:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh
|
|
||||||
|
|
||||||
Run the single node training benchmark:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh
|
|
||||||
|
|
||||||
* Example 6: Single node training with DeepSeek V2 16B
|
|
||||||
|
|
||||||
Download the benchmarking script:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
|
|
||||||
|
|
||||||
Run the single node training benchmark:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh
|
|
||||||
|
|
||||||
.. note::
|
|
||||||
|
|
||||||
The reported TFLOP/s by MaxText for DeepSeek is not accurate. Use
|
|
||||||
the tokens/s as a performance indicator.
|
|
||||||
|
|
||||||
Multi-node training benchmarking examples
|
|
||||||
-----------------------------------------
|
|
||||||
|
|
||||||
The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
|
|
||||||
own cluster setup.
|
|
||||||
|
|
||||||
* Example 1: Multi-node training with Llama 2 7B
|
|
||||||
|
|
||||||
Download the benchmarking script:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
|
|
||||||
|
|
||||||
Run the multi-node training benchmark. For example:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
sbatch -N <num_nodes> llama2_7b_multinode.sh
|
|
||||||
|
|
||||||
* Example 2: Multi-node training with Llama 2 70B
|
|
||||||
|
|
||||||
Download the benchmarking script:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
|
|
||||||
|
|
||||||
Run the multi-node training benchmark. For example:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
sbatch -N <num_nodes> llama2_70b_multinode.sh
|
|
||||||
|
|
||||||
* Example 3: Multi-node training with Llama 3 8B model
|
|
||||||
|
|
||||||
Download the benchmarking script:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
|
|
||||||
|
|
||||||
Run the multi-node training benchmark. For example:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
sbatch -N <num_nodes> llama3_8b_multinode.sh
|
|
||||||
|
|
||||||
* Example 4: Multi-node training with Llama 3 70B model
|
|
||||||
|
|
||||||
Download the benchmarking script:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
|
|
||||||
|
|
||||||
Run the multi-node training benchmark. For example:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
sbatch -N <num_nodes> llama3_70b_multinode.sh
|
|
||||||
|
|
||||||
Previous versions
|
Previous versions
|
||||||
=================
|
=================
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
.. meta::
|
.. meta::
|
||||||
:description: How to train a model using Megatron-LM for ROCm.
|
:description: How to train a model using Megatron-LM for ROCm.
|
||||||
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||||
@@ -6,6 +8,14 @@
|
|||||||
Training a model with Megatron-LM for ROCm
|
Training a model with Megatron-LM for ROCm
|
||||||
******************************************
|
******************************************
|
||||||
|
|
||||||
|
.. caution::
|
||||||
|
|
||||||
|
The ROCm Megatron-LM framework now has limited support in this Docker
|
||||||
|
environment; development focuses on Primus with Megatron-Core. See :doc:`primus-megatron`.
|
||||||
|
|
||||||
|
To learn how to migrate your existing workloads to Primus with Megatron-Core,
|
||||||
|
see :doc:`previous-versions/megatron-lm-primus-migration-guide`.
|
||||||
|
|
||||||
The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
|
The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
|
||||||
a specialized fork of the robust Megatron-LM, designed to enable efficient
|
a specialized fork of the robust Megatron-LM, designed to enable efficient
|
||||||
training of large-scale language models on AMD GPUs. By leveraging AMD
|
training of large-scale language models on AMD GPUs. By leveraging AMD
|
||||||
@@ -20,13 +30,17 @@ essential components, including PyTorch, ROCm libraries, and Megatron-LM
|
|||||||
utilities. It contains the following software components to accelerate training
|
utilities. It contains the following software components to accelerate training
|
||||||
workloads:
|
workloads:
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
|
||||||
|
Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
||||||
|
|
||||||
{% set dockers = data.dockers %}
|
{% set dockers = data.dockers %}
|
||||||
{% if dockers|length > 1 %}
|
|
||||||
.. tab-set::
|
.. tab-set::
|
||||||
|
|
||||||
{% for docker in data.dockers %}
|
{% for docker in dockers %}
|
||||||
.. tab-item:: ``{{ docker.pull_tag }}``
|
.. tab-item:: ``{{ docker.pull_tag }}``
|
||||||
:sync: {{ docker.pull_tag }}
|
:sync: {{ docker.pull_tag }}
|
||||||
|
|
||||||
@@ -42,60 +56,46 @@ workloads:
|
|||||||
|
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% elif dockers|length == 1 %}
|
|
||||||
.. list-table::
|
|
||||||
:header-rows: 1
|
|
||||||
|
|
||||||
* - Software component
|
|
||||||
- Version
|
|
||||||
|
|
||||||
{% for component_name, component_version in docker.components %}
|
|
||||||
* - {{ component_name }}
|
|
||||||
- {{ component_version }}
|
|
||||||
|
|
||||||
{% endfor %}
|
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
.. _amd-megatron-lm-model-support:
|
.. _amd-megatron-lm-model-support:
|
||||||
|
|
||||||
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
|
||||||
|
|
||||||
Supported models
|
Supported models
|
||||||
================
|
================
|
||||||
|
|
||||||
The following models are supported for training performance benchmarking with Megatron-LM and ROCm.
|
The following models are supported for training performance benchmarking with Megatron-LM and ROCm
|
||||||
|
on AMD Instinct MI300X series accelerators.
|
||||||
Some instructions, commands, and training recommendations in this documentation might
|
Some instructions, commands, and training recommendations in this documentation might
|
||||||
vary by model -- select one to get started.
|
vary by model -- select one to get started.
|
||||||
|
|
||||||
{% set model_groups = data.model_groups %}
|
{% set model_groups = data.model_groups %}
|
||||||
.. raw:: html
|
.. raw:: html
|
||||||
|
|
||||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
<div class="row">
|
<div class="row gx-0">
|
||||||
<div class="col-2 me-2 model-param-head">Model</div>
|
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||||
<div class="row col-10">
|
<div class="row col-10 pe-0">
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="row mt-1">
|
<div class="row gx-0 pt-1">
|
||||||
<div class="col-2 me-2 model-param-head">Model variant</div>
|
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||||
<div class="row col-10">
|
<div class="row col-10 pe-0">
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% set models = model_group.models %}
|
{% set models = model_group.models %}
|
||||||
{% for model in models %}
|
{% for model in models %}
|
||||||
{% if models|length % 3 == 0 %}
|
{% if models|length % 3 == 0 %}
|
||||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
{% else %}
|
{% else %}
|
||||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
|
||||||
</div>
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
@@ -177,7 +177,7 @@ Download the Docker image
|
|||||||
{% if dockers|length > 1 %}
|
{% if dockers|length > 1 %}
|
||||||
.. tab-set::
|
.. tab-set::
|
||||||
|
|
||||||
{% for docker in data.dockers %}
|
{% for docker in dockers %}
|
||||||
.. tab-item:: {{ docker.doc_name }}
|
.. tab-item:: {{ docker.doc_name }}
|
||||||
:sync: {{ docker.pull_tag }}
|
:sync: {{ docker.pull_tag }}
|
||||||
|
|
||||||
@@ -227,10 +227,17 @@ Download the Docker image
|
|||||||
docker start megatron_training_env
|
docker start megatron_training_env
|
||||||
docker exec -it megatron_training_env bash
|
docker exec -it megatron_training_env bash
|
||||||
|
|
||||||
The Docker container includes a pre-installed, verified version of the ROCm
|
4. **Megatron-LM backward compatibility setup** -- this Docker image is primarily intended for use with Primus, but it maintains limited Megatron-LM compatibility.
|
||||||
Megatron-LM development branch
|
To roll back to using Megatron-LM, follow these steps:
|
||||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__, including necessary
|
|
||||||
training scripts.
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd /workspace/Megatron-LM/
|
||||||
|
pip uninstall megatron-core
|
||||||
|
pip install -e .
|
||||||
|
|
||||||
|
The Docker container hosts
|
||||||
|
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__ at verified commit ``e8e9edc``.
|
||||||
|
|
||||||
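As a quick sanity check, you can confirm the commit of the hosted checkout from inside the container (assuming the default ``/workspace/Megatron-LM`` path used above):

.. code-block:: shell

   cd /workspace/Megatron-LM
   git log -1 --oneline   # expected to show commit e8e9edc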
.. _amd-megatron-lm-environment-setup:
|
.. _amd-megatron-lm-environment-setup:
|
||||||
|
|
||||||
|
|||||||
@@ -17,12 +17,21 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <http
|
|||||||
- Components
|
- Components
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
* - 25.5 (latest)
|
* - 25.7 (latest)
|
||||||
|
-
|
||||||
|
* ROCm 6.4.1
|
||||||
|
* JAX 0.6.0, 0.5.0
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <../jax-maxtext>`
|
||||||
|
* `Docker Hub (JAX 0.6.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7-jax060/images/sha256-7352212ae033a76dca2b9dceffc23c1b5f1a61a7a560082cf747a9bf1acfc9ce>`__
|
||||||
|
* `Docker Hub (JAX 0.5.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025>`__
|
||||||
|
|
||||||
|
* - 25.5
|
||||||
-
|
-
|
||||||
* ROCm 6.3.4
|
* ROCm 6.3.4
|
||||||
* JAX 0.4.35
|
* JAX 0.4.35
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../jax-maxtext>`
|
* :doc:`Documentation <jax-maxtext-v25.5>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`__
|
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`__
|
||||||
|
|
||||||
* - 25.4
|
* - 25.4
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ MaxText provides the following key features to train large language models effic
|
|||||||
|
|
||||||
- Multi-node support
|
- Multi-node support
|
||||||
|
|
||||||
.. _amd-maxtext-model-support:
|
.. _amd-maxtext-model-support-v254:
|
||||||
|
|
||||||
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,385 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
|
.. meta::
|
||||||
|
:description: How to train a model using JAX MaxText for ROCm.
|
||||||
|
:keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
|
||||||
|
|
||||||
|
**************************************
|
||||||
|
Training a model with MaxText for ROCm
|
||||||
|
**************************************
|
||||||
|
|
||||||
|
.. caution::
|
||||||
|
|
||||||
|
This documentation does not reflect the latest version of ROCm JAX MaxText
|
||||||
|
training performance documentation. See :doc:`../jax-maxtext` for the latest version.
|
||||||
|
|
||||||
|
MaxText is a high-performance, open-source framework built on the Google JAX
|
||||||
|
machine learning library to train LLMs at scale. The MaxText framework for
|
||||||
|
ROCm is an optimized fork of the upstream
|
||||||
|
`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
|
||||||
|
on AMD MI300X series accelerators.
|
||||||
|
|
||||||
|
The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.5``) image
|
||||||
|
provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
|
||||||
|
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
|
||||||
|
It includes the following software components:
|
||||||
|
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Software component | Version |
|
||||||
|
+==========================+================================+
|
||||||
|
| ROCm | 6.3.4 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| JAX | 0.4.35 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Python | 3.10.12 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Transformer Engine | 1.12.0.dev0+b8b92dc |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| hipBLASLt | 0.13.0-ae9c477a |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
|
||||||
|
Supported features and models
|
||||||
|
=============================
|
||||||
|
|
||||||
|
MaxText provides the following key features to train large language models efficiently:
|
||||||
|
|
||||||
|
- Transformer Engine (TE)
|
||||||
|
|
||||||
|
- Flash Attention (FA) 3
|
||||||
|
|
||||||
|
- GEMM tuning
|
||||||
|
|
||||||
|
- Multi-node support
|
||||||
|
|
||||||
|
.. _amd-maxtext-model-support-v255:
|
||||||
|
|
||||||
|
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
||||||
|
|
||||||
|
* Llama 3.3 70B
|
||||||
|
|
||||||
|
* Llama 3.1 8B
|
||||||
|
|
||||||
|
* Llama 3.1 70B
|
||||||
|
|
||||||
|
* Llama 3 8B
|
||||||
|
|
||||||
|
* Llama 3 70B
|
||||||
|
|
||||||
|
* Llama 2 7B
|
||||||
|
|
||||||
|
* Llama 2 70B
|
||||||
|
|
||||||
|
* DeepSeek-V2-Lite
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Some models, such as Llama 3, require an external license agreement through
|
||||||
|
a third party (for example, Meta).
|
||||||
|
|
||||||
|
Unsupported features
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
Currently, MaxText's default packed input format is not supported. Using this format
|
||||||
|
with the current Docker image results in incorrect attention calculations
|
||||||
|
across different input sequences. Support for packed input format is planned for a future release.
|
||||||
|
|
||||||
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
|
correctly and performing optimally.
|
||||||
|
|
||||||
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before starting training.
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
|
Environment setup
|
||||||
|
=================
|
||||||
|
|
||||||
|
This Docker image is optimized for specific model configurations outlined
|
||||||
|
as follows. Performance can vary for other training workloads, as AMD
|
||||||
|
doesn’t validate configurations and run conditions outside those described.
|
||||||
|
|
||||||
|
.. _amd-maxtext-multi-node-setup-v255:
|
||||||
|
|
||||||
|
Multi-node setup
|
||||||
|
----------------
|
||||||
|
|
||||||
|
For multi-node environments, ensure you have all the necessary packages for
your network device, such as RDMA. If you're not using a multi-node setup
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker-v255`.
|
||||||
|
|
||||||
|
1. Install the following packages to build and install the RDMA driver.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
sudo apt install iproute2 -y
|
||||||
|
sudo apt install -y linux-headers-"$(uname -r)" libelf-dev
|
||||||
|
sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev
|
||||||
|
|
||||||
|
Refer to your NIC manufacturer's documentation for further steps on
|
||||||
|
compiling and installing the RoCE driver. For example, for Broadcom,
|
||||||
|
see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
|
||||||
|
in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.
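After the driver is installed, you can optionally confirm that the RDMA devices are
visible. The check below is a suggested sanity test only; ``ibv_devinfo`` is provided
by the ``ibverbs-utils`` package installed in the previous step.

.. code-block:: shell

# List RDMA-capable devices and their port state.
# Each NIC you plan to use should report PORT_ACTIVE.
ibv_devinfo | grep -E "hca_id|state"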
|
||||||
|
|
||||||
|
2. Set the following environment variables.
|
||||||
|
|
||||||
|
a. Master address
|
||||||
|
|
||||||
|
Change ``localhost`` to the master node's resolvable hostname or IP address:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
export MASTER_ADDR="${MASTER_ADDR:-localhost}"
|
||||||
|
|
||||||
|
b. Number of nodes
|
||||||
|
|
||||||
|
Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
export NNODES="${NNODES:-1}"
|
||||||
|
|
||||||
|
c. Node ranks
|
||||||
|
|
||||||
|
Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
Node ranks should be unique across all nodes in the cluster.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
export NODE_RANK="${NODE_RANK:-0}"
|
||||||
|
|
||||||
|
d. Network interface
|
||||||
|
|
||||||
|
Update the network interface in the script to match your system's network interface. To
|
||||||
|
find your network interface, run the following (outside of any Docker container):
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
ip a
|
||||||
|
|
||||||
|
Look for an active interface with an IP address in the same subnet as
|
||||||
|
your other nodes. Then, update the following variable in the script, for
|
||||||
|
example:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
export NCCL_SOCKET_IFNAME=ens50f0np0
|
||||||
|
|
||||||
|
This variable specifies which network interface to use for inter-node communication.
|
||||||
|
Setting this variable to the incorrect interface can result in communication failures
|
||||||
|
or significantly reduced performance.
|
||||||
|
|
||||||
|
e. RDMA interface
|
||||||
|
|
||||||
|
Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v255>` are installed on all nodes.
|
||||||
|
Then, set the RDMA interfaces to use for communication.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
# If using Broadcom NIC
|
||||||
|
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
|
||||||
|
# If using Mellanox NIC
|
||||||
|
export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
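As a recap, a complete environment for the second node of a hypothetical two-node
cluster might look like the following. The interface and HCA names are examples only;
substitute the values discovered on your own system.

.. code-block:: bash

# Example values for node rank 1 in a two-node cluster (not prescriptive).
export MASTER_ADDR=10.0.0.1              # master node hostname or IP
export NNODES=2
export NODE_RANK=1
export NCCL_SOCKET_IFNAME=ens50f0np0     # your active inter-node interface
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7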
|
||||||
|
|
||||||
|
.. _amd-maxtext-download-docker-v255:
|
||||||
|
|
||||||
|
Pull the Docker image
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
1. Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull rocm/jax-training:maxtext-v25.5
|
||||||
|
|
||||||
|
2. Use the following command to launch the Docker container. Note that the benchmarking scripts
|
||||||
|
used in the :ref:`following section <amd-maxtext-get-started-v255>` automatically launch the Docker container
|
||||||
|
and execute the benchmark.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5
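If you exit the container and need to return to it later, you can restart and
reattach to the same ``maxtext_training`` container:

.. code-block:: shell

docker start maxtext_training
docker exec -it maxtext_training bash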
|
||||||
|
|
||||||
|
.. _amd-maxtext-get-started-v255:
|
||||||
|
|
||||||
|
Getting started
|
||||||
|
===============
|
||||||
|
|
||||||
|
The following examples demonstrate how to get started with single node
|
||||||
|
and multi-node training using the benchmarking scripts provided at
|
||||||
|
`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
|
||||||
|
|
||||||
|
.. important::
|
||||||
|
|
||||||
|
The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
|
||||||
|
|
||||||
|
Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
|
||||||
|
set correctly and points to your Hugging Face cache directory. Refer to the
|
||||||
|
README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
|
||||||
|
for more detailed instructions.
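For example, assuming ``/data/huggingface`` is the cache directory on your host
(adjust the path for your environment):

.. code-block:: shell

# Example path only -- point this at your actual Hugging Face cache.
export HF_HOME=/data/huggingface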
|
||||||
|
|
||||||
|
Single node training benchmarking examples
|
||||||
|
------------------------------------------
|
||||||
|
|
||||||
|
* Example 1: Single node training with Llama 2 7B
|
||||||
|
|
||||||
|
Download the benchmarking script:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
|
||||||
|
|
||||||
|
Run the single node training benchmark:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh
|
||||||
|
|
||||||
|
* Example 2: Single node training with Llama 2 70B
|
||||||
|
|
||||||
|
Download the benchmarking script:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
|
||||||
|
|
||||||
|
Run the single node training benchmark:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh
|
||||||
|
|
||||||
|
* Example 3: Single node training with Llama 3 8B
|
||||||
|
|
||||||
|
Download the benchmarking script:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
|
||||||
|
|
||||||
|
Run the single node training benchmark:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh
|
||||||
|
|
||||||
|
* Example 4: Single node training with Llama 3 70B
|
||||||
|
|
||||||
|
Download the benchmarking script:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
|
||||||
|
|
||||||
|
Run the single node training benchmark:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh
|
||||||
|
|
||||||
|
* Example 5: Single node training with Llama 3.3 70B
|
||||||
|
|
||||||
|
Download the benchmarking script:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh
|
||||||
|
|
||||||
|
Run the single node training benchmark:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh
|
||||||
|
|
||||||
|
* Example 6: Single node training with DeepSeek V2 16B
|
||||||
|
|
||||||
|
Download the benchmarking script:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
|
||||||
|
|
||||||
|
Run the single node training benchmark:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
The TFLOP/s value reported by MaxText for DeepSeek is not accurate. Use
tokens/s as the performance indicator instead.
|
||||||
|
|
||||||
|
Multi-node training benchmarking examples
|
||||||
|
-----------------------------------------
|
||||||
|
|
||||||
|
The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
|
||||||
|
own cluster setup.
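For example, after downloading one of the scripts below, a typical submission and
monitoring sequence on a Slurm cluster looks like this (node count and job ID are
illustrative):

.. code-block:: shell

sbatch -N 2 llama2_7b_multinode.sh   # submit the benchmark on two nodes
squeue -u $USER                      # check that the job is queued or running
tail -f slurm-<jobid>.out            # follow the job output (file name may differ)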
|
||||||
|
|
||||||
|
* Example 1: Multi-node training with Llama 2 7B
|
||||||
|
|
||||||
|
Download the benchmarking script:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
|
||||||
|
|
||||||
|
Run the multi-node training benchmark. For example:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
sbatch -N <num_nodes> llama2_7b_multinode.sh
|
||||||
|
|
||||||
|
* Example 2: Multi-node training with Llama 2 70B
|
||||||
|
|
||||||
|
Download the benchmarking script:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
|
||||||
|
|
||||||
|
Run the multi-node training benchmark. For example:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
sbatch -N <num_nodes> llama2_70b_multinode.sh
|
||||||
|
|
||||||
|
* Example 3: Multi-node training with Llama 3 8B model
|
||||||
|
|
||||||
|
Download the benchmarking script:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
|
||||||
|
|
||||||
|
Run the multi-node training benchmark. For example:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
sbatch -N <num_nodes> llama3_8b_multinode.sh
|
||||||
|
|
||||||
|
* Example 4: Multi-node training with Llama 3 70B model
|
||||||
|
|
||||||
|
Download the benchmarking script:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
|
||||||
|
|
||||||
|
Run the multi-node training benchmark. For example:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
sbatch -N <num_nodes> llama3_70b_multinode.sh
|
||||||
|
|
||||||
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`jax-maxtext-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/jax-training`` Docker image.
|
||||||
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
|
|||||||
- Components
|
- Components
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
* - v25.6 (latest)
|
* - v25.7 (latest)
|
||||||
|
-
|
||||||
|
* ROCm
|
||||||
|
* PyTorch
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <../megatron-lm>`
|
||||||
|
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a>`__
|
||||||
|
|
||||||
|
* - v25.6
|
||||||
-
|
-
|
||||||
* ROCm 6.4.1
|
* ROCm 6.4.1
|
||||||
* PyTorch 2.8.0a0+git7d205b2
|
* PyTorch 2.8.0a0+git7d205b2
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../megatron-lm>`
|
* :doc:`Documentation <megatron-lm-v25.6>`
|
||||||
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
|
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
|
||||||
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__
|
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,175 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
|
**********************************************************************
|
||||||
|
Migrating workloads to Primus (Megatron-Core backend) from Megatron-LM
|
||||||
|
**********************************************************************
|
||||||
|
|
||||||
|
Primus supports Megatron-Core as a backend optimization library,
replacing ROCm Megatron-LM. This document outlines the steps to migrate
workloads from ROCm Megatron-LM to Primus with the Megatron-Core backend.
|
||||||
|
|
||||||
|
Model architecture
|
||||||
|
==================
|
||||||
|
|
||||||
|
ROCm Megatron-LM defines model architecture parameters in the training scripts;
|
||||||
|
for example, the Llama 3 8B model parameters are defined in
|
||||||
|
`examples/llama/train_llama3.sh <https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama3.sh#L117>`__
|
||||||
|
as shown below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
HIDDEN_SIZE=4096
|
||||||
|
FFN_HIDDEN_SIZE=14336
|
||||||
|
NUM_LAYERS=32
|
||||||
|
NUM_HEADS=32
|
||||||
|
NUM_KV_HEADS=8
|
||||||
|
|
||||||
|
Primus defines the model architecture through model YAML configuration files
inside the ``primus/configs/models/megatron/`` directory of the repository. For example, the Llama 3 8B
model architecture parameters are defined in
|
||||||
|
`primus/configs/models/megatron/llama3_8B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/llama3_8B.yaml>`__
|
||||||
|
as shown below:
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
bases:
|
||||||
|
- llama3_base.yaml
|
||||||
|
|
||||||
|
tokenizer_type: Llama3Tokenizer
|
||||||
|
tokenizer_model: meta-llama/Llama-3.1-8B
|
||||||
|
|
||||||
|
ffn_hidden_size: 14336
|
||||||
|
hidden_size: 4096
|
||||||
|
num_attention_heads: 32
|
||||||
|
num_layers: 32
|
||||||
|
num_query_groups: 8
|
||||||
|
|
||||||
|
Primus' model config files follow a hierarchical design, meaning that new model
|
||||||
|
config YAMLs can inherit existing model config files by importing them as
|
||||||
|
bases. For example,
|
||||||
|
`llama3.1_8B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
|
||||||
|
uses ``llama3_8B.yaml`` as a base config and overrides a few parameters, as shown below.
|
||||||
|
In this example, ``llama3.1_8B`` overrides the ``max_position_embeddings`` value:
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
bases:
|
||||||
|
- llama3_8B.yaml
|
||||||
|
|
||||||
|
tokenizer_type: Llama3Tokenizer
|
||||||
|
tokenizer_model: meta-llama/Llama-3.1-8B
|
||||||
|
|
||||||
|
max_position_embeddings: 131072
|
||||||
|
|
||||||
|
.. tip::
|
||||||
|
|
||||||
|
Primus provides ``llama_base.yaml`` as the base configuration, which can be
used as a base for additional model architectures. For example,
`mixtral_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/mixtral_base.yaml>`__
and
`deepseek_v3_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/deepseek_v3_base.yaml>`__
define ``llama_base.yaml`` as their base.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
# Example mixtral_base.yaml:
|
||||||
|
|
||||||
|
bases:
|
||||||
|
- llama_base.yaml
|
||||||
|
|
||||||
|
init_method_std: 0.01
|
||||||
|
rotary_base: 1000000
|
||||||
|
qk_layernorm: false
|
||||||
|
|
||||||
|
group_query_attention: true
|
||||||
|
num_query_groups: 8
|
||||||
|
|
||||||
|
# moe parameters
|
||||||
|
num_experts: 8
|
||||||
|
moe_router_topk: 2
|
||||||
|
moe_router_load_balancing_type: aux_loss
|
||||||
|
moe_aux_loss_coeff: 1e-2
|
||||||
|
moe_grouped_gemm: true
|
||||||
|
moe_token_dispatcher_type: alltoall
|
||||||
|
|
||||||
|
To add a new category of model, it is recommended to create a new
``${MODEL_NAME}_base.yaml`` and define new models on top of it. For example, to add
Qwen2.5 models to Primus, define
|
||||||
|
`qwen2.5_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_base.yaml>`__
|
||||||
|
and build
|
||||||
|
`qwen2.5_7B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_7B.yaml>`__
|
||||||
|
and
|
||||||
|
`qwen2.5_72B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_72B.yaml>`__
|
||||||
|
using ``qwen2.5_base.yaml`` as the base config.
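As an illustration only (this file is not part of the Primus repository), a new
``mymodel_base.yaml`` could inherit the shared Llama defaults and override just the
settings that differ:

.. code-block:: yaml

# Hypothetical example -- not an actual Primus config file.
bases:
- llama_base.yaml

# Override only what differs from the Llama defaults.
group_query_attention: true
num_query_groups: 8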
|
||||||
|
|
||||||
|
Training parameters
|
||||||
|
===================
|
||||||
|
|
||||||
|
ROCm Megatron-LM also defines the training parameters, such as batch size,
tensor parallelism, and precision, in the training scripts. For example,
the Llama 3 8B model parameters are defined in
|
||||||
|
`examples/llama/train_llama3.sh <https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama3.sh>`__
|
||||||
|
as shown below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
TP="${TP:-8}"
|
||||||
|
PP="${PP:-1}"
|
||||||
|
CP="${CP:-1}"
|
||||||
|
MBS="${MBS:-1}"
|
||||||
|
BS="${BS:-8}"
|
||||||
|
|
||||||
|
Primus defines the training parameters in top-level YAML files -- see
|
||||||
|
`examples/megatron/configs/
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
|
||||||
|
For example, the `llama3.1_8B-pretrain.yaml
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
|
||||||
|
configuration imports the ``llama3.1_8B.yaml`` model architecture file. Users can then override
|
||||||
|
the default training parameters in ``llama3.1_8B-pretrain.yaml``.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
# model to run
|
||||||
|
model: llama3.1_8B.yaml # Model architecture yaml
|
||||||
|
overrides:
|
||||||
|
# log
|
||||||
|
# disable_wandb: false
|
||||||
|
# disable_tensorboard: false
|
||||||
|
stderr_sink_level: DEBUG
|
||||||
|
|
||||||
|
log_avg_skip_iterations: 2
|
||||||
|
log_avg_reset_interval: 50
|
||||||
|
|
||||||
|
train_iters: 50
|
||||||
|
micro_batch_size: 2
|
||||||
|
global_batch_size: 128
|
||||||
|
|
||||||
|
seq_length: 8192
|
||||||
|
max_position_embeddings: 8192
|
||||||
|
|
||||||
|
lr: 1.0e-5
|
||||||
|
min_lr: 0.0
|
||||||
|
lr_warmup_iters: 2
|
||||||
|
lr_decay_iters: null
|
||||||
|
lr_decay_style: cosine
|
||||||
|
weight_decay: 0.1
|
||||||
|
adam_beta1: 0.9
|
||||||
|
adam_beta2: 0.95
|
||||||
|
eod_mask_loss: true
|
||||||
|
init_method_std: 0.008
|
||||||
|
norm_epsilon: 1.0e-6
|
||||||
|
|
||||||
|
Backward compatibility with Megatron-LM
|
||||||
|
=======================================
|
||||||
|
|
||||||
|
The Dockerized environment used for Primus maintains limited compatibility with
Megatron-LM. To roll back to using Megatron-LM, follow these steps.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd /workspace/Megatron-LM/
|
||||||
|
pip uninstall megatron-core
|
||||||
|
pip install -e .
|
||||||
|
|
||||||
|
Once Megatron-LM is installed, follow :doc:`the documentation <../megatron-lm>` to run workloads as
|
||||||
|
usual.
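To confirm the rollback took effect, you can check which ``megatron`` package Python
resolves; the printed path should point into ``/workspace/Megatron-LM`` after the
editable install (this check is a suggestion, not part of the official workflow):

.. code-block:: shell

python -c "import megatron.core; print(megatron.core.__file__)"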
|
||||||
@@ -18,7 +18,7 @@ Training a model with ROCm Megatron-LM
|
|||||||
The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
|
The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
|
||||||
enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
|
enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
|
||||||
accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
|
accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
|
||||||
workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
|
workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support-24-12>`
|
||||||
like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
|
like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
|
||||||
efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
|
efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
|
||||||
|
|
||||||
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e
|
|||||||
|
|
||||||
- Pre-training
|
- Pre-training
|
||||||
|
|
||||||
.. _amd-megatron-lm-model-support:
|
.. _amd-megatron-lm-model-support-24-12:
|
||||||
|
|
||||||
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
||||||
|
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e
|
|||||||
|
|
||||||
- Pre-training
|
- Pre-training
|
||||||
|
|
||||||
.. _amd-megatron-lm-model-support:
|
.. _amd-megatron-lm-model-support-25-3:
|
||||||
|
|
||||||
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
||||||
|
|
||||||
@@ -278,7 +278,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
|
|||||||
.. tab-item:: Llama
|
.. tab-item:: Llama
|
||||||
:sync: llama
|
:sync: llama
|
||||||
|
|
||||||
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
|
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``Llama2Tokenizer``.
|
||||||
|
|
||||||
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
|
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
|
||||||
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
|
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
|
||||||
@@ -292,7 +292,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
|
|||||||
.. tab-item:: DeepSeek V2
|
.. tab-item:: DeepSeek V2
|
||||||
:sync: deepseek
|
:sync: deepseek
|
||||||
|
|
||||||
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
|
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``DeepSeekV2Tokenizer``.
|
||||||
|
|
||||||
Multi-node training
|
Multi-node training
|
||||||
^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e
|
|||||||
|
|
||||||
- Pre-training
|
- Pre-training
|
||||||
|
|
||||||
.. _amd-megatron-lm-model-support:
|
.. _amd-megatron-lm-model-support-25-4:
|
||||||
|
|
||||||
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
||||||
|
|
||||||
@@ -291,7 +291,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
|
|||||||
.. tab-item:: Llama
|
.. tab-item:: Llama
|
||||||
:sync: llama
|
:sync: llama
|
||||||
|
|
||||||
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
|
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``Llama2Tokenizer``
|
||||||
or the default ``HuggingFaceTokenizer``.
|
or the default ``HuggingFaceTokenizer``.
|
||||||
|
|
||||||
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
|
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
|
||||||
@@ -320,7 +320,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
|
|||||||
.. tab-item:: DeepSeek V2
|
.. tab-item:: DeepSeek V2
|
||||||
:sync: deepseek
|
:sync: deepseek
|
||||||
|
|
||||||
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
|
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``DeepSeekV2Tokenizer``.
|
||||||
|
|
||||||
Multi-node training
|
Multi-node training
|
||||||
^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^
|
||||||
|
|||||||
File diff suppressed because it is too large
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
|
|||||||
- Components
|
- Components
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
|
* - v25.7
|
||||||
|
-
|
||||||
|
* ROCm 6.4.2
|
||||||
|
* PyTorch 2.8.0a0+gitd06a406
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <../pytorch-training>`
|
||||||
|
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712>`__
|
||||||
|
|
||||||
* - v25.6
|
* - v25.6
|
||||||
-
|
-
|
||||||
* ROCm 6.3.4
|
* ROCm 6.3.4
|
||||||
* PyTorch 2.8.0a0+git7d205b2
|
* PyTorch 2.8.0a0+git7d205b2
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../pytorch-training>`
|
* :doc:`Documentation <pytorch-training-v25.6>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__
|
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__
|
||||||
|
|
||||||
* - v25.5
|
* - v25.5
|
||||||
|
|||||||
@@ -437,3 +437,8 @@ Once the setup is complete, choose between two options to start benchmarking:
|
|||||||
|
|
||||||
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
|
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
|
||||||
|
|
||||||
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/pytorch-training`` Docker image.
|
||||||
|
|||||||
@@ -0,0 +1,456 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
|
.. meta::
|
||||||
|
:description: How to train a model using PyTorch for ROCm.
|
||||||
|
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||||
|
|
||||||
|
**************************************
|
||||||
|
Training a model with PyTorch for ROCm
|
||||||
|
**************************************
|
||||||
|
|
||||||
|
.. caution::
|
||||||
|
|
||||||
|
This documentation does not reflect the latest version of the ROCm PyTorch
training performance documentation. See :doc:`../pytorch-training` for the latest version.
|
||||||
|
|
||||||
|
PyTorch is an open-source machine learning framework that is widely used for
|
||||||
|
model training with GPU-optimized components for transformer-based models.
|
||||||
|
|
||||||
|
The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_
|
||||||
|
(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
|
||||||
|
model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
|
||||||
|
training workloads:
|
||||||
|
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Software component | Version |
|
||||||
|
+==========================+================================+
|
||||||
|
| ROCm | 6.3.4 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| PyTorch | 2.8.0a0+git7d205b2 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Python | 3.10.17 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Transformer Engine | 1.14.0+2f85f5f2 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Flash Attention | 3.0.0.post1 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| hipBLASLt | 0.15.0-8c6919d |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Triton | 3.3.0 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
|
||||||
|
.. _amd-pytorch-training-model-support-v256:
|
||||||
|
|
||||||
|
Supported models
|
||||||
|
================
|
||||||
|
|
||||||
|
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.model_groups %}
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-2 me-2 model-param-head">Workload</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
<div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row mt-1">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Some models require an external license agreement through a third party (for example, Meta).
|
||||||
|
|
||||||
|
.. _amd-pytorch-training-performance-measurements-v256:
|
||||||
|
|
||||||
|
Performance measurements
|
||||||
|
========================
|
||||||
|
|
||||||
|
To evaluate performance, the
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||||
|
page provides reference throughput and latency measurements for training
|
||||||
|
popular AI models.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
The performance data presented in
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||||
|
should not be interpreted as the peak performance achievable by AMD
|
||||||
|
Instinct MI325X and MI300X accelerators or ROCm software.
|
||||||
|
|
||||||
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
|
correctly and performing optimally.
|
||||||
|
|
||||||
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before starting training.
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
|
This Docker image is optimized for specific model configurations outlined
|
||||||
|
below. Performance can vary for other training workloads, as AMD
|
||||||
|
doesn’t validate configurations and run conditions outside those described.
|
||||||
|
|
||||||
|
Benchmarking
|
||||||
|
============
|
||||||
|
|
||||||
|
Once the setup is complete, choose between two options to start benchmarking:
|
||||||
|
|
||||||
|
.. tab-set::
|
||||||
|
|
||||||
|
.. tab-item:: MAD-integrated benchmarking
|
||||||
|
|
||||||
|
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||||
|
directory and install the required packages on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
|
For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||||
|
using one GPU with the {{ model.precision }} data type on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||||
|
madengine run \
|
||||||
|
--tags {{ model.mad_tag }} \
|
||||||
|
--keep-model-dir \
|
||||||
|
--live-output \
|
||||||
|
--timeout 28800
|
||||||
|
|
||||||
|
MAD launches a Docker container with the name
|
||||||
|
``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
|
||||||
|
model are collected in the following path: ``~/MAD/perf.csv``.
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. tab-item:: Standalone benchmarking
|
||||||
|
|
||||||
|
.. rubric:: Download the Docker image and required packages
|
||||||
|
|
||||||
|
Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
|
Run the Docker container.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
|
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker start training_env
|
||||||
|
docker exec -it training_env bash
|
||||||
|
|
||||||
|
In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
|
||||||
|
repository and navigate to the benchmark scripts directory
|
||||||
|
``/workspace/MAD/scripts/pytorch_train``.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD/scripts/pytorch_train
|
||||||
|
|
||||||
|
.. rubric:: Prepare training datasets and dependencies
|
||||||
|
|
||||||
|
The following benchmarking examples require downloading models and datasets
|
||||||
|
from Hugging Face. To ensure successful access to gated repos, set your
|
||||||
|
``HF_TOKEN``.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||||
|
|
||||||
|
Run the setup script to install libraries and datasets needed for benchmarking.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./pytorch_benchmark_setup.sh
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_train_llama-3.1-8b
|
||||||
|
|
||||||
|
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Library
|
||||||
|
- Reference
|
||||||
|
|
||||||
|
* - ``accelerate``
|
||||||
|
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||||
|
|
||||||
|
* - ``datasets``
|
||||||
|
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_train_llama-3.1-70b
|
||||||
|
|
||||||
|
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Library
|
||||||
|
- Reference
|
||||||
|
|
||||||
|
* - ``datasets``
|
||||||
|
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||||
|
|
||||||
|
* - ``torchdata``
|
||||||
|
- `TorchData <https://pytorch.org/data/beta/index.html>`_
|
||||||
|
|
||||||
|
* - ``tomli``
|
||||||
|
- `Tomli <https://pypi.org/project/tomli/>`_
|
||||||
|
|
||||||
|
* - ``tiktoken``
|
||||||
|
- `tiktoken <https://github.com/openai/tiktoken>`_
|
||||||
|
|
||||||
|
* - ``blobfile``
|
||||||
|
- `blobfile <https://pypi.org/project/blobfile/>`_
|
||||||
|
|
||||||
|
* - ``tabulate``
|
||||||
|
- `tabulate <https://pypi.org/project/tabulate/>`_
|
||||||
|
|
||||||
|
* - ``wandb``
|
||||||
|
- `Weights & Biases <https://github.com/wandb/wandb>`_
|
||||||
|
|
||||||
|
* - ``sentencepiece``
|
||||||
|
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
|
||||||
|
|
||||||
|
* - ``tensorboard``
|
||||||
|
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_train_flux
|
||||||
|
|
||||||
|
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Library
|
||||||
|
- Reference
|
||||||
|
|
||||||
|
* - ``accelerate``
|
||||||
|
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||||
|
|
||||||
|
* - ``datasets``
|
||||||
|
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||||
|
|
||||||
|
* - ``sentencepiece``
|
||||||
|
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
|
||||||
|
|
||||||
|
* - ``tensorboard``
|
||||||
|
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
|
||||||
|
|
||||||
|
* - ``csvkit``
|
||||||
|
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
|
||||||
|
|
||||||
|
* - ``deepspeed``
|
||||||
|
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
|
||||||
|
|
||||||
|
* - ``diffusers``
|
||||||
|
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
|
||||||
|
|
||||||
|
* - ``GitPython``
|
||||||
|
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
|
||||||
|
|
||||||
|
* - ``opencv-python-headless``
|
||||||
|
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
|
||||||
|
|
||||||
|
* - ``peft``
|
||||||
|
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
|
||||||
|
|
||||||
|
* - ``protobuf``
|
||||||
|
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
|
||||||
|
|
||||||
|
* - ``pytest``
|
||||||
|
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
|
||||||
|
|
||||||
|
* - ``python-dotenv``
|
||||||
|
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
|
||||||
|
|
||||||
|
* - ``seaborn``
|
||||||
|
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
|
||||||
|
|
||||||
|
* - ``transformers``
|
||||||
|
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
|
||||||
|
|
||||||
|
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
|
||||||
|
|
||||||
|
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
{% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
|
.. rubric:: Pretraining
|
||||||
|
|
||||||
|
To start the pre-training benchmark, use the following command with the
|
||||||
|
appropriate options. See the following list of options and their descriptions.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Name
|
||||||
|
- Options
|
||||||
|
- Description
|
||||||
|
|
||||||
|
{% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
|
||||||
|
* - ``$datatype``
|
||||||
|
- ``BF16`` or ``FP8``
|
||||||
|
- Only Llama 3.1 8B supports FP8 precision.
|
||||||
|
{% else %}
|
||||||
|
* - ``$datatype``
|
||||||
|
- ``BF16``
|
||||||
|
- Only Llama 3.1 8B supports FP8 precision.
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
* - ``$sequence_length``
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
|
||||||
|
|
||||||
|
{% if model.mad_tag == "pyt_train_flux" %}
|
||||||
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Occasionally, downloading the Flux dataset might fail. In the event of this
|
||||||
|
error, manually download it from Hugging Face at
|
||||||
|
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||||
|
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
|
||||||
|
the required dataset.
|
||||||
|
{% endif %}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{% if model_group.tag == "fine-tuning" %}
|
||||||
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
|
.. rubric:: Fine-tuning
|
||||||
|
|
||||||
|
To start the fine-tuning benchmark, use the following command with the
|
||||||
|
appropriate options. See the following list of options and their descriptions.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Name
|
||||||
|
- Options
|
||||||
|
- Description
|
||||||
|
|
||||||
|
* - ``$training_mode``
|
||||||
|
- ``finetune_fw``
|
||||||
|
- Full weight fine-tuning (BF16 supported)
|
||||||
|
|
||||||
|
* -
|
||||||
|
- ``finetune_lora``
|
||||||
|
- LoRA fine-tuning (BF16 supported)
|
||||||
|
|
||||||
|
* -
|
||||||
|
- ``finetune_qlora``
|
||||||
|
- QLoRA fine-tuning (BF16 supported)
|
||||||
|
|
||||||
|
* -
|
||||||
|
- ``HF_finetune_lora``
|
||||||
|
- LoRA fine-tuning with Hugging Face PEFT
|
||||||
|
|
||||||
|
* - ``$datatype``
|
||||||
|
- ``BF16``
|
||||||
|
- All models support BF16.
|
||||||
|
|
||||||
|
* - ``$sequence_length``
|
||||||
|
- Between 2048 and 16384.
|
||||||
|
- Sequence length for the language model.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
{{ model.model }} currently supports the following fine-tuning methods:
|
||||||
|
|
||||||
|
{% for method in model.training_modes %}
|
||||||
|
* ``{{ method }}``
|
||||||
|
{% endfor %}
|
||||||
|
{% if model.training_modes|length < 4 %}
|
||||||
|
|
||||||
|
The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
|
||||||
|
does not currently provide YAML configuration files for other combinations of
model and fine-tuning method.
However, you can still configure your own YAML files to enable support for
|
||||||
|
fine-tuning methods not listed here by following existing patterns in the
|
||||||
|
``/workspace/torchtune/recipes/configs`` directory.
|
||||||
|
{% endif %}
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. rubric:: Benchmarking examples
|
||||||
|
|
||||||
|
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
|
||||||
|
|
||||||
|
Further reading
|
||||||
|
===============
|
||||||
|
|
||||||
|
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||||
|
|
||||||
|
- To learn more about system settings and management practices to configure your system for
|
||||||
|
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||||
|
|
||||||
|
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||||
|
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||||
|
|
||||||
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/pytorch-training`` Docker image.
|
||||||
@@ -0,0 +1,602 @@
|
|||||||
|
.. meta::
|
||||||
|
:description: How to train a model using Megatron-LM for ROCm.
|
||||||
|
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||||
|
|
||||||
|
**********************************************
|
||||||
|
Training a model with Primus and Megatron-Core
|
||||||
|
**********************************************
|
||||||
|
|
||||||
|
`Primus <https://github.com/AMD-AIG-AIMA/Primus>`__ is a unified and flexible
|
||||||
|
LLM training framework designed to simplify large-scale model training. It streamlines LLM
training on AMD Instinct accelerators using a modular, reproducible configuration paradigm.
|
||||||
|
Primus is backend-agnostic and supports multiple training engines -- including Megatron-Core.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Primus with the Megatron-Core backend is intended to replace ROCm
|
||||||
|
Megatron-LM in this Dockerized training environment. To learn how to migrate
|
||||||
|
workloads from Megatron-LM to Primus with Megatron-Core, see
|
||||||
|
:doc:`previous-versions/megatron-lm-primus-migration-guide`.
|
||||||
|
|
||||||
|
For ease of use, AMD provides a ready-to-use Docker image for MI300 series accelerators
|
||||||
|
containing essential components for Primus and Megatron-Core.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
|
||||||
|
Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set dockers = data.dockers %}
|
||||||
|
{% set docker = dockers[0] %}
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Software component
|
||||||
|
- Version
|
||||||
|
|
||||||
|
{% for component_name, component_version in docker.components.items() %}
|
||||||
|
* - {{ component_name }}
|
||||||
|
- {{ component_version }}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-model-support:
|
||||||
|
|
||||||
|
Supported models
|
||||||
|
================
|
||||||
|
|
||||||
|
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
||||||
|
Some instructions, commands, and training examples in this documentation might
|
||||||
|
vary by model -- select one to get started.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set model_groups = data.model_groups %}
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
|
<div class="row gx-0">
|
||||||
|
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||||
|
<div class="row col-10 pe-0">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row gx-0 pt-1">
|
||||||
|
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||||
|
<div class="row col-10 pe-0">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Some models, such as Llama, require an external license agreement through
|
||||||
|
a third party (for example, Meta).
|
||||||
|
|
||||||
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
|
correctly and performing optimally.
|
||||||
|
|
||||||
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before starting training.
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
|
.. _mi300x-amd-primus-megatron-lm-training:
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set dockers = data.dockers %}
|
||||||
|
{% set docker = dockers[0] %}
|
||||||
|
|
||||||
|
Environment setup
|
||||||
|
=================
|
||||||
|
|
||||||
|
Use the following instructions to set up the environment, configure the script to train models, and
|
||||||
|
reproduce the benchmark results on MI300X series accelerators with the ``{{ docker.pull_tag }}`` image.
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-requirements:
|
||||||
|
|
||||||
|
Download the Docker image
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
1. Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ docker.pull_tag }}
|
||||||
|
|
||||||
|
2. Launch the Docker container.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker run -it \
|
||||||
|
--device /dev/dri \
|
||||||
|
--device /dev/kfd \
|
||||||
|
--device /dev/infiniband \
|
||||||
|
--network host --ipc host \
|
||||||
|
--group-add video \
|
||||||
|
--cap-add SYS_PTRACE \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--privileged \
|
||||||
|
-v $HOME:$HOME \
|
||||||
|
--shm-size 128G \
|
||||||
|
--name primus_training_env \
|
||||||
|
{{ docker.pull_tag }}
|
||||||
|
|
||||||
|
3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker start primus_training_env
|
||||||
|
docker exec -it primus_training_env bash
|
||||||
|
|
||||||
|
The Docker container hosts verified release tag ``v0.1.0-rc1`` of the `Primus
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1>`__ repository.
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-environment-setup:
|
||||||
|
|
||||||
|
Configuration
|
||||||
|
=============
|
||||||
|
|
||||||
|
Primus defines a training configuration in YAML for each model in
|
||||||
|
`examples/megatron/configs <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set model_groups = data.model_groups %}
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
|
To update training parameters for {{ model.model }}, you can update ``examples/megatron/configs/{{ model.config_name }}``.
|
||||||
|
Note that training configuration YAML files for other models follow this naming convention.
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
See :ref:`Key options <amd-primus-megatron-lm-benchmark-test-vars>` for more information on configuration options.
|
||||||
|
|
||||||
|
Dataset options
|
||||||
|
---------------
|
||||||
|
|
||||||
|
You can use either mock data or real data for training.
|
||||||
|
|
||||||
|
* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default
  value is ``true``, which enables mock data.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
mock_data: true
|
||||||
|
|
||||||
|
* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
mock_data: false
|
||||||
|
train_data_path: /path/to/your/dataset
|
||||||
|
|
||||||
|
Ensure that the files are accessible inside the Docker container.
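For example, with the container launched as shown earlier, you could copy a dataset from the host into it and point the YAML at the in-container path. The paths here are placeholders, not paths shipped with the image.

.. code-block:: shell

   # Placeholder paths -- adjust to your dataset location
   docker cp /data/my_dataset primus_training_env:/workspace/data/my_dataset

Then set ``train_data_path: /workspace/data/my_dataset`` in the model's configuration YAML.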
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-tokenizer:
|
||||||
|
|
||||||
|
Tokenizer
|
||||||
|
---------
|
||||||
|
|
||||||
|
In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama
3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
``tokenizer_type: Llama3Tokenizer``, as defined in the `llama3.1-8B model
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
definition. As such, you need to set the ``HF_TOKEN`` environment variable to a token with the
right permissions to access the tokenizer for each model.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
# Export your HF_TOKEN in the workspace
|
||||||
|
export HF_TOKEN=<your_hftoken>
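To sanity-check that the token is picked up before launching a run, you can query Hugging Face from inside the container. This assumes the ``huggingface_hub`` CLI is available in the image (install it with ``pip`` if it isn't).

.. code-block:: shell

   # Should print your Hugging Face account name if HF_TOKEN is valid
   huggingface-cli whoami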
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-run-training:
|
||||||
|
|
||||||
|
Run training
|
||||||
|
============
|
||||||
|
|
||||||
|
Use the following example commands to set up the environment, configure
|
||||||
|
:ref:`key options <amd-primus-megatron-lm-benchmark-test-vars>`, and run training on
|
||||||
|
MI300X series accelerators with the Primus training environment.
|
||||||
|
|
||||||
|
Single node training
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
To run training on a single node, navigate to ``/workspace/Primus`` and use the following setup commands:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
pip install -r requirements.txt
|
||||||
|
export HSA_NO_SCRATCH_RECLAIM=1
|
||||||
|
export NVTE_CK_USES_BWD_V3=1
|
||||||
|
|
||||||
|
Once setup is complete, run the appropriate training command.
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||||
|
|
||||||
|
To run pre-training for Llama 3.3 70B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--micro_batch_size 2 \
|
||||||
|
--global_batch_size 16 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||||
|
|
||||||
|
To run pre-training for Llama 3.1 8B FP8, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50 \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
For Llama 3.1 8B BF16, use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||||
|
|
||||||
|
To run pre-training for Llama 3.1 70B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
To run training on a single node for Llama 3.1 70B FP8 with a reduced-layer proxy (40 layers), use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50 \
|
||||||
|
--num_layers 40 \
|
||||||
|
--fp8 hybrid \
|
||||||
|
--no_fp8_weight_transpose_cache true
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Use two or more nodes to run the *full* Llama 70B model with FP8 precision.
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
|
||||||
|
|
||||||
|
To run pre-training for Llama 2 7B FP8, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50 \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
To run pre-training for Llama 2 7B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
|
||||||
|
|
||||||
|
To run pre-training for Llama 2 70B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy
|
||||||
|
|
||||||
|
To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh \
|
||||||
|
--num_layers 3 \
|
||||||
|
--moe_layer_freq 1 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||||
|
|
||||||
|
To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh \
|
||||||
|
--global_batch_size 256 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||||
|
|
||||||
|
To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||||
|
|
||||||
|
To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with a 4-layer proxy,
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh \
|
||||||
|
--num_layers 4 \
|
||||||
|
--pipeline_model_parallel_size 1 \
|
||||||
|
--micro_batch_size 1 \
|
||||||
|
--global_batch_size 16 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b
|
||||||
|
|
||||||
|
To run training on a single node for Qwen 2.5 7B BF16, use the following
|
||||||
|
command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
For FP8, use the following command.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh \
|
||||||
|
--train_iters 50 \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||||
|
|
||||||
|
To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
Multi-node training examples
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
To run training on multiple nodes, you can use the
`run_slurm_pretrain.sh <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/run_slurm_pretrain.sh>`__
script to launch the multi-node workload. Use the following steps to set up your environment:
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set dockers = data.dockers %}
|
||||||
|
{% set docker = dockers[0] %}
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd /workspace/Primus/
|
||||||
|
export DOCKER_IMAGE={{ docker.pull_tag }}
|
||||||
|
export HF_TOKEN=<your_HF_token>
|
||||||
|
export HSA_NO_SCRATCH_RECLAIM=1
|
||||||
|
export NVTE_CK_USES_BWD_V3=1
|
||||||
|
export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
|
||||||
|
export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
|
||||||
|
export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
|
||||||
|
export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
* Make sure the correct network drivers are installed on the nodes. If you're running inside a Docker container, either install the drivers inside the container or pass them through from the host when creating the container.
|
||||||
|
* If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus tries to auto-detect them. However, because NICs can vary across clusters, it's recommended to explicitly export the NCCL parameters for your cluster, as in the sketch after this note.
|
||||||
|
* To find your network interface, you can use ``ip a``.
|
||||||
|
* To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices.
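For illustration, on a typical RoCE cluster the exports might end up looking like the following. The interface and device names below are examples only and will differ on your cluster.

.. code-block:: shell

   ip a            # pick the NIC that carries node-to-node traffic, for example ens51f0
   ibv_devices     # list RDMA devices, for example mlx5_0 ... mlx5_7

   export NCCL_SOCKET_IFNAME=ens51f0
   export GLOO_SOCKET_IFNAME=ens51f0
   export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3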
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||||
|
|
||||||
|
To train Llama 3.3 70B FP8 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||||
|
bash examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 4 \
|
||||||
|
--global_batch_size 256 \
|
||||||
|
--recompute_num_layers 80 \
|
||||||
|
--no_fp8_weight_transpose_cache true \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
To train Llama 3.3 70B BF16 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||||
|
bash examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 1 \
|
||||||
|
--global_batch_size 256 \
|
||||||
|
--recompute_num_layers 12
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||||
|
|
||||||
|
To train Llama 3.1 8B FP8 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
# Adjust the training parameters. For example, set `global_batch_size` to 8 * the single-node batch size for 8 nodes.
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||||
|
bash ./examples/run_slurm_pretrain.sh \
|
||||||
|
--global_batch_size 1024 \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||||
|
|
||||||
|
To train Llama 3.1 70B FP8 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||||
|
bash examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 4 \
|
||||||
|
--global_batch_size 256 \
|
||||||
|
--recompute_num_layers 80 \
|
||||||
|
--no_fp8_weight_transpose_cache true \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
To train Llama 3.1 70B BF16 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||||
|
bash examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 1 \
|
||||||
|
--global_batch_size 256 \
|
||||||
|
--recompute_num_layers 12
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
|
||||||
|
|
||||||
|
To train Llama 2 7B FP8 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
# Adjust the training parameters. For example, set `global_batch_size` to 8 * the single-node batch size for 8 nodes.
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama2_7B-pretrain.yaml bash ./examples/run_slurm_pretrain.sh --global_batch_size 2048 --fp8 hybrid
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
|
||||||
|
|
||||||
|
To train Llama 2 70B FP8 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||||
|
bash examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 10 \
|
||||||
|
--global_batch_size 640 \
|
||||||
|
--recompute_num_layers 80 \
|
||||||
|
--no_fp8_weight_transpose_cache true \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
To train Llama 2 70B BF16 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 2 \
|
||||||
|
--global_batch_size 1536 \
|
||||||
|
--recompute_num_layers 12
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||||
|
|
||||||
|
To train Mixtral 8x7B BF16 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||||
|
bash examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 2 \
|
||||||
|
--global_batch_size 256
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||||
|
|
||||||
|
To train Qwen2.5 72B FP8 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
|
||||||
|
bash examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 8 \
|
||||||
|
--global_batch_size 512 \
|
||||||
|
--recompute_num_layers 80 \
|
||||||
|
--no_fp8_weight_transpose_cache true \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-benchmark-test-vars:
|
||||||
|
|
||||||
|
Key options
|
||||||
|
-----------
|
||||||
|
|
||||||
|
The following are key options to note when configuring a training run. A sample configuration that combines several of them follows this list.
|
||||||
|
|
||||||
|
fp8
|
||||||
|
``hybrid`` enables FP8 GEMMs.
|
||||||
|
|
||||||
|
use_torch_fsdp2
|
||||||
|
``use_torch_fsdp2: 1`` enables PyTorch FSDP2. If FSDP is enabled,
|
||||||
|
set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``.
|
||||||
|
|
||||||
|
profile
|
||||||
|
To enable PyTorch profiling, set these parameters:
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
profile: true
|
||||||
|
use_pytorch_profiler: true
|
||||||
|
profile_step_end: 7
|
||||||
|
profile_step_start: 6
|
||||||
|
|
||||||
|
train_iters
|
||||||
|
The total number of iterations (default: 50).
|
||||||
|
|
||||||
|
mock_data
|
||||||
|
Whether to use mock data instead of a real dataset. ``true`` by default.
|
||||||
|
|
||||||
|
micro_batch_size
|
||||||
|
Micro batch size.
|
||||||
|
|
||||||
|
global_batch_size
|
||||||
|
Global batch size.
|
||||||
|
|
||||||
|
recompute_granularity
|
||||||
|
For activation checkpointing.
|
||||||
|
|
||||||
|
num_layers
|
||||||
|
Use a reduced number of layers, as with proxy models.
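As an illustration, overriding several of these options in a model's training YAML might look like the following. The exact keys and valid values are defined by the config files in ``examples/megatron/configs``; the values below are placeholders rather than tuned recommendations.

.. code-block:: yaml

   # Placeholder values -- tune per model and cluster
   mock_data: true
   train_iters: 50
   micro_batch_size: 2
   global_batch_size: 256
   fp8: hybrid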
|
||||||
|
|
||||||
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`previous-versions/megatron-lm-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/megatron-lm`` Docker image.
|
||||||
|
|
||||||
|
This training environment now uses Primus with Megatron as the primary
|
||||||
|
configuration. Limited support for the legacy ROCm Megatron-LM is still
|
||||||
|
available. For instructions on using ROCm Megatron-LM, see the
|
||||||
|
:doc:`megatron-lm` document.
|
||||||
@@ -9,28 +9,25 @@ Training a model with PyTorch for ROCm
|
|||||||
PyTorch is an open-source machine learning framework that is widely used for
|
PyTorch is an open-source machine learning framework that is widely used for
|
||||||
model training with GPU-optimized components for transformer-based models.
|
model training with GPU-optimized components for transformer-based models.
|
||||||
|
|
||||||
The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||||
(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
|
|
||||||
model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
|
|
||||||
training workloads:
|
|
||||||
|
|
||||||
+--------------------------+--------------------------------+
|
{% set dockers = data.dockers %}
|
||||||
| Software component | Version |
|
{% set docker = dockers[0] %}
|
||||||
+==========================+================================+
|
The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__
|
||||||
| ROCm | 6.3.4 |
|
(``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
|
||||||
+--------------------------+--------------------------------+
|
model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
|
||||||
| PyTorch | 2.8.0a0+git7d205b2 |
|
training workloads:
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
| Python | 3.10.17 |
|
.. list-table::
|
||||||
+--------------------------+--------------------------------+
|
:header-rows: 1
|
||||||
| Transformer Engine | 1.14.0+2f85f5f2 |
|
|
||||||
+--------------------------+--------------------------------+
|
* - Software component
|
||||||
| Flash Attention | 3.0.0.post1 |
|
- Version
|
||||||
+--------------------------+--------------------------------+
|
|
||||||
| hipBLASLt | 0.15.0-8c6919d |
|
{% for component_name, component_version in docker.components.items() %}
|
||||||
+--------------------------+--------------------------------+
|
* - {{ component_name }}
|
||||||
| Triton | 3.3.0 |
|
- {{ component_version }}
|
||||||
+--------------------------+--------------------------------+
|
{% endfor %}
|
||||||
|
|
||||||
.. _amd-pytorch-training-model-support:
|
.. _amd-pytorch-training-model-support:
|
||||||
|
|
||||||
@@ -38,119 +35,152 @@ Supported models
|
|||||||
================
|
================
|
||||||
|
|
||||||
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
|
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
|
||||||
|
Some instructions, commands, and training recommendations in this documentation might
|
||||||
|
vary by model -- select one to get started.
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||||
|
|
||||||
{% set unified_docker = data.unified_docker.latest %}
|
{% set unified_docker = data.dockers[0] %}
|
||||||
{% set model_groups = data.model_groups %}
|
{% set model_groups = data.model_groups %}
|
||||||
|
|
||||||
.. raw:: html
|
.. raw:: html
|
||||||
|
|
||||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
<div class="row">
|
<div class="row gx-0">
|
||||||
<div class="col-2 me-2 model-param-head">Workload</div>
|
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||||
<div class="row col-10">
|
<div class="row col-10 pe-0">
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
<div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
{% endfor %}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="row mt-1">
|
|
||||||
<div class="col-2 me-2 model-param-head">Model</div>
|
|
||||||
<div class="row col-10">
|
|
||||||
{% for model_group in model_groups %}
|
|
||||||
{% set models = model_group.models %}
|
|
||||||
{% for model in models %}
|
|
||||||
{% if models|length % 3 == 0 %}
|
|
||||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
|
||||||
{% else %}
|
|
||||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
|
||||||
{% endif %}
|
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
|
||||||
|
<div class="row gx-0 pt-1">
|
||||||
|
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||||
|
<div class="row col-10 pe-0">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
.. note::
|
|
||||||
|
|
||||||
Some models require an external license agreement through a third party (for example, Meta).
|
.. _amd-pytorch-training-supported-training-modes:
|
||||||
|
|
||||||
.. _amd-pytorch-training-performance-measurements:
|
The following table lists supported training modes per model.
|
||||||
|
|
||||||
Performance measurements
|
.. dropdown:: Supported training modes
|
||||||
========================
|
|
||||||
|
|
||||||
To evaluate performance, the
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Model
|
||||||
|
- Supported training modes
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
* - {{ model.model }}
|
||||||
|
- ``{{ model.training_modes | join('``, ``') }}``
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Some model and fine-tuning combinations are not listed. This is
|
||||||
|
because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
|
||||||
|
doesn't provide default YAML configurations for them.
|
||||||
|
For advanced usage, you can create a custom configuration to enable
|
||||||
|
unlisted fine-tuning methods by using an existing file in the
|
||||||
|
``/workspace/torchtune/recipes/configs`` directory as a template.
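For instance, a custom run could start from one of the shipped configs. The recipe and config names below are examples; check the directory for the files actually present in your container.

.. code-block:: shell

   # Example only -- confirm the recipe/config names available in your image
   cp /workspace/torchtune/recipes/configs/llama3_1/8B_lora_single_device.yaml my_finetune.yaml
   # edit my_finetune.yaml, then launch the matching torchtune recipe
   tune run lora_finetune_single_device --config my_finetune.yaml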
|
||||||
|
|
||||||
|
.. _amd-pytorch-training-performance-measurements:
|
||||||
|
|
||||||
|
Performance measurements
|
||||||
|
========================
|
||||||
|
|
||||||
|
To evaluate performance, the
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||||
|
page provides reference throughput and latency measurements for training
|
||||||
|
popular AI models.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
The performance data presented in
|
||||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||||
page provides reference throughput and latency measurements for training
|
should not be interpreted as the peak performance achievable by AMD
|
||||||
popular AI models.
|
Instinct MI325X and MI300X accelerators or ROCm software.
|
||||||
|
|
||||||
.. note::
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
The performance data presented in
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
correctly and performing optimally.
|
||||||
should not be interpreted as the peak performance achievable by AMD
|
|
||||||
Instinct MI325X and MI300X accelerators or ROCm software.
|
|
||||||
|
|
||||||
System validation
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
=================
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before starting training.
|
||||||
|
|
||||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
correctly and performing optimally.
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
This Docker image is optimized for specific model configurations outlined
|
||||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
below. Performance can vary for other training workloads, as AMD
|
||||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
doesn’t test configurations and run conditions outside those described.
|
||||||
before starting training.
|
|
||||||
|
|
||||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
Run training
|
||||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
============
|
||||||
system's configuration.
|
|
||||||
|
|
||||||
This Docker image is optimized for specific model configurations outlined
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||||
below. Performance can vary for other training workloads, as AMD
|
|
||||||
doesn’t validate configurations and run conditions outside those described.
|
|
||||||
|
|
||||||
Benchmarking
|
{% set unified_docker = data.dockers[0] %}
|
||||||
============
|
{% set model_groups = data.model_groups %}
|
||||||
|
|
||||||
Once the setup is complete, choose between two options to start benchmarking:
|
Once the setup is complete, choose between two options to start benchmarking training:
|
||||||
|
|
||||||
.. tab-set::
|
.. tab-set::
|
||||||
|
|
||||||
.. tab-item:: MAD-integrated benchmarking
|
.. tab-item:: MAD-integrated benchmarking
|
||||||
|
|
||||||
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||||
directory and install the required packages on the host machine.
|
directory and install the required packages on the host machine.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
git clone https://github.com/ROCm/MAD
|
git clone https://github.com/ROCm/MAD
|
||||||
cd MAD
|
cd MAD
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
.. container:: model-doc {{ model.mad_tag }}
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||||
using one GPU with the {{ model.precision }} data type on the host machine.
|
using one node with the {{ model.precision }} data type on the host machine.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||||
madengine run \
|
madengine run \
|
||||||
--tags {{ model.mad_tag }} \
|
--tags {{ model.mad_tag }} \
|
||||||
--keep-model-dir \
|
--keep-model-dir \
|
||||||
--live-output \
|
--live-output \
|
||||||
--timeout 28800
|
--timeout 28800
|
||||||
|
|
||||||
MAD launches a Docker container with the name
|
MAD launches a Docker container with the name
|
||||||
``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
|
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||||
model are collected in the following path: ``~/MAD/perf.csv``.
|
model are collected in ``~/MAD/perf.csv``.
|
||||||
|
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
@@ -159,222 +189,213 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
|
|||||||
|
|
||||||
.. rubric:: Download the Docker image and required packages
|
.. rubric:: Download the Docker image and required packages
|
||||||
|
|
||||||
Use the following command to pull the Docker image from Docker Hub.
|
1. Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
docker pull {{ unified_docker.pull_tag }}
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
Run the Docker container.
|
2. Run the Docker container.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
|
docker run -it \
|
||||||
|
--device /dev/dri \
|
||||||
|
--device /dev/kfd \
|
||||||
|
--network host \
|
||||||
|
--ipc host \
|
||||||
|
--group-add video \
|
||||||
|
--cap-add SYS_PTRACE \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--privileged \
|
||||||
|
-v $HOME:$HOME \
|
||||||
|
-v $HOME/.ssh:/root/.ssh \
|
||||||
|
--shm-size 64G \
|
||||||
|
--name training_env \
|
||||||
|
{{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
docker start training_env
|
docker start training_env
|
||||||
docker exec -it training_env bash
|
docker exec -it training_env bash
|
||||||
|
|
||||||
In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
|
3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
|
||||||
repository and navigate to the benchmark scripts directory
|
repository and navigate to the benchmark scripts directory
|
||||||
``/workspace/MAD/scripts/pytorch_train``.
|
``/workspace/MAD/scripts/pytorch_train``.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
git clone https://github.com/ROCm/MAD
|
git clone https://github.com/ROCm/MAD
|
||||||
cd MAD/scripts/pytorch_train
|
cd MAD/scripts/pytorch_train
|
||||||
|
|
||||||
.. rubric:: Prepare training datasets and dependencies
|
.. rubric:: Prepare training datasets and dependencies
|
||||||
|
|
||||||
The following benchmarking examples require downloading models and datasets
|
1. The following benchmarking examples require downloading models and datasets
|
||||||
from Hugging Face. To ensure successful access to gated repos, set your
|
from Hugging Face. To ensure successful access to gated repos, set your
|
||||||
``HF_TOKEN``.
|
``HF_TOKEN``.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||||
|
|
||||||
Run the setup script to install libraries and datasets needed for benchmarking.
|
2. Run the setup script to install libraries and datasets needed for benchmarking.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
./pytorch_benchmark_setup.sh
|
./pytorch_benchmark_setup.sh
|
||||||
|
|
||||||
.. container:: model-doc pyt_train_llama-3.1-8b
|
.. container:: model-doc pyt_train_llama-3.1-8b
|
||||||
|
|
||||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
|
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Library
|
* - Library
|
||||||
- Reference
|
- Reference
|
||||||
|
|
||||||
* - ``accelerate``
|
* - ``accelerate``
|
||||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||||
|
|
||||||
* - ``datasets``
|
* - ``datasets``
|
||||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||||
|
|
||||||
.. container:: model-doc pyt_train_llama-3.1-70b
|
.. container:: model-doc pyt_train_llama-3.1-70b
|
||||||
|
|
||||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
|
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Library
|
* - Library
|
||||||
- Reference
|
- Reference
|
||||||
|
|
||||||
* - ``datasets``
|
* - ``datasets``
|
||||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||||
|
|
||||||
* - ``torchdata``
|
* - ``torchdata``
|
||||||
- `TorchData <https://pytorch.org/data/beta/index.html>`_
|
- `TorchData <https://pytorch.org/data/beta/index.html>`_
|
||||||
|
|
||||||
* - ``tomli``
|
* - ``tomli``
|
||||||
- `Tomli <https://pypi.org/project/tomli/>`_
|
- `Tomli <https://pypi.org/project/tomli/>`_
|
||||||
|
|
||||||
* - ``tiktoken``
|
* - ``tiktoken``
|
||||||
- `tiktoken <https://github.com/openai/tiktoken>`_
|
- `tiktoken <https://github.com/openai/tiktoken>`_
|
||||||
|
|
||||||
* - ``blobfile``
|
* - ``blobfile``
|
||||||
- `blobfile <https://pypi.org/project/blobfile/>`_
|
- `blobfile <https://pypi.org/project/blobfile/>`_
|
||||||
|
|
||||||
* - ``tabulate``
|
* - ``tabulate``
|
||||||
- `tabulate <https://pypi.org/project/tabulate/>`_
|
- `tabulate <https://pypi.org/project/tabulate/>`_
|
||||||
|
|
||||||
* - ``wandb``
|
* - ``wandb``
|
||||||
- `Weights & Biases <https://github.com/wandb/wandb>`_
|
- `Weights & Biases <https://github.com/wandb/wandb>`_
|
||||||
|
|
||||||
* - ``sentencepiece``
|
* - ``sentencepiece``
|
||||||
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
|
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
|
||||||
|
|
||||||
* - ``tensorboard``
|
* - ``tensorboard``
|
||||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
|
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
|
||||||
|
|
||||||
.. container:: model-doc pyt_train_flux
|
.. container:: model-doc pyt_train_flux
|
||||||
|
|
||||||
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
|
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
|
|
||||||
* - Library
|
* - Library
|
||||||
- Reference
|
- Reference
|
||||||
|
|
||||||
* - ``accelerate``
|
* - ``accelerate``
|
||||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||||
|
|
||||||
* - ``datasets``
|
* - ``datasets``
|
||||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||||
|
|
||||||
* - ``sentencepiece``
|
* - ``sentencepiece``
|
||||||
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
|
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
|
||||||
|
|
||||||
* - ``tensorboard``
|
* - ``tensorboard``
|
||||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
|
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
|
||||||
|
|
||||||
* - ``csvkit``
|
* - ``csvkit``
|
||||||
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
|
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
|
||||||
|
|
||||||
* - ``deepspeed``
|
* - ``deepspeed``
|
||||||
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
|
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
|
||||||
|
|
||||||
* - ``diffusers``
|
* - ``diffusers``
|
||||||
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
|
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
|
||||||
|
|
||||||
* - ``GitPython``
|
* - ``GitPython``
|
||||||
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
|
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
|
||||||
|
|
||||||
* - ``opencv-python-headless``
|
* - ``opencv-python-headless``
|
||||||
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
|
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
|
||||||
|
|
||||||
* - ``peft``
|
* - ``peft``
|
||||||
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
|
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
|
||||||
|
|
||||||
* - ``protobuf``
|
* - ``protobuf``
|
||||||
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
|
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
|
||||||
|
|
||||||
* - ``pytest``
|
* - ``pytest``
|
||||||
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
|
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
|
||||||
|
|
||||||
* - ``python-dotenv``
|
* - ``python-dotenv``
|
||||||
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
|
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
|
||||||
|
|
||||||
* - ``seaborn``
|
* - ``seaborn``
|
||||||
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
|
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
|
||||||
|
|
||||||
* - ``transformers``
|
* - ``transformers``
|
||||||
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
|
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
|
||||||
|
|
||||||
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
|
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
|
||||||
|
|
||||||
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
|
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
{% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
|
{% set training_modes = model.training_modes %}
|
||||||
|
{% set training_mode_descs = {
|
||||||
|
"pretrain": "Benchmark pre-training.",
|
||||||
|
"HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
|
||||||
|
} %}
|
||||||
|
{% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
|
||||||
|
{% if available_modes %}
|
||||||
|
|
||||||
.. container:: model-doc {{ model.mad_tag }}
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
.. rubric:: Pretraining
|
.. rubric:: Pre-training
|
||||||
|
|
||||||
To start the pre-training benchmark, use the following command with the
|
To start the pre-training benchmark, use the following command with the
|
||||||
appropriate options. See the following list of options and their descriptions.
|
appropriate options. See the following list of options and their descriptions.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
|
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||||
|
-m {{ model.model_repo }} \
|
||||||
.. list-table::
|
-p $datatype \
|
||||||
:header-rows: 1
|
-s $sequence_length
|
||||||
|
|
||||||
* - Name
|
|
||||||
- Options
|
|
||||||
- Description
|
|
||||||
|
|
||||||
{% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
|
|
||||||
* - ``$datatype``
|
|
||||||
- ``BF16`` or ``FP8``
|
|
||||||
- Only Llama 3.1 8B supports FP8 precision.
|
|
||||||
{% else %}
|
|
||||||
* - ``$datatype``
|
|
||||||
- ``BF16``
|
|
||||||
- Only Llama 3.1 8B supports FP8 precision.
|
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
* - ``$sequence_length``
|
|
||||||
- Sequence length for the language model.
|
|
||||||
- Between 2048 and 8192. 8192 by default.
|
|
||||||
|
|
||||||
{% if model.mad_tag == "pyt_train_flux" %}
|
{% if model.mad_tag == "pyt_train_flux" %}
|
||||||
.. container:: model-doc {{ model.mad_tag }}
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
|
Currently, FLUX models are not supported out-of-the-box on {{ unified_docker.pull_tag }}.
|
||||||
|
To use FLUX, refer to the previous version of the ``pytorch-training`` Docker: :doc:`previous-versions/pytorch-training-v25.6`
|
||||||
|
|
||||||
Occasionally, downloading the Flux dataset might fail. In the event of this
|
Occasionally, downloading the Flux dataset might fail. In the event of this
|
||||||
error, manually download it from Hugging Face at
|
error, manually download it from Hugging Face at
|
||||||
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||||
and save it to `/workspace/FluxBenchmark`. This ensures that the test script can access
|
and save it to `/workspace/FluxBenchmark`. This ensures that the test script can access
|
||||||
the required dataset.
|
the required dataset.
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
{% if model_group.tag == "fine-tuning" %}
|
|
||||||
.. container:: model-doc {{ model.mad_tag }}
|
|
||||||
|
|
||||||
.. rubric:: Fine-tuning
|
|
||||||
|
|
||||||
To start the fine-tuning benchmark, use the following command with the
|
|
||||||
appropriate options. See the following list of options and their descriptions.
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
|
|
||||||
|
|
||||||
.. list-table::
|
.. list-table::
|
||||||
:header-rows: 1
|
:header-rows: 1
|
||||||
@@ -383,53 +404,143 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
|
|||||||
- Options
|
- Options
|
||||||
- Description
|
- Description
|
||||||
|
|
||||||
* - ``$training_mode``
|
{% for mode in available_modes %}
|
||||||
- ``finetune_fw``
|
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||||
- Full weight fine-tuning (BF16 supported)
|
- ``{{ mode }}``
|
||||||
|
- {{ training_mode_descs[mode] }}
|
||||||
* -
|
{% endfor %}
|
||||||
- ``finetune_lora``
|
|
||||||
- LoRA fine-tuning (BF16 supported)
|
|
||||||
|
|
||||||
* -
|
|
||||||
- ``finetune_qlora``
|
|
||||||
- QLoRA fine-tuning (BF16 supported)
|
|
||||||
|
|
||||||
* -
|
|
||||||
- ``HF_finetune_lora``
|
|
||||||
- LoRA fine-tuning with Hugging Face PEFT
|
|
||||||
|
|
||||||
* - ``$datatype``
|
* - ``$datatype``
|
||||||
- ``BF16``
|
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||||
- All models support BF16.
|
- Only Llama 3.1 8B supports FP8 precision.
|
||||||
|
|
||||||
|
* - ``$sequence_length``
|
||||||
|
- Sequence length for the language model.
|
||||||
|
- Between 2048 and 8192. 8192 by default.
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{% set training_mode_descs = {
|
||||||
|
"finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
|
||||||
|
"finetune_lora": "LoRA fine-tuning (BF16 supported).",
|
||||||
|
"finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
|
||||||
|
"HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
|
||||||
|
} %}
|
||||||
|
{% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
|
||||||
|
{% if available_modes %}
|
||||||
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
|
.. rubric:: Fine-tuning
|
||||||
|
|
||||||
|
To start the fine-tuning benchmark, use the following command with the
|
||||||
|
appropriate options. See the following list of options and their descriptions.
|
||||||
|
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes>`.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./pytorch_benchmark_report.sh -t $training_mode \
|
||||||
|
-m {{ model.model_repo }} \
|
||||||
|
-p $datatype \
|
||||||
|
-s $sequence_length
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Name
|
||||||
|
- Options
|
||||||
|
- Description
|
||||||
|
|
||||||
|
{% for mode in available_modes %}
|
||||||
|
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||||
|
- ``{{ mode }}``
|
||||||
|
- {{ training_mode_descs[mode] }}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
* - ``$datatype``
|
||||||
|
- ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
|
||||||
|
- All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}
|
||||||
|
|
||||||
* - ``$sequence_length``
|
* - ``$sequence_length``
|
||||||
- Between 2048 and 16384.
|
- Between 2048 and 16384.
|
||||||
- Sequence length for the language model.
|
- Sequence length for the language model.
|
||||||
|
|
||||||
|
{% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
{{ model.model }} currently supports the following fine-tuning methods:
|
For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
|
||||||
|
use the following torchtune commit for compatibility:
|
||||||
|
|
||||||
{% for method in model.training_modes %}
|
.. code-block:: shell
|
||||||
* ``{{ method }}``
|
|
||||||
{% endfor %}
|
git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
|
||||||
{% if model.training_modes|length < 4 %}
|
|
||||||
|
{% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
|
||||||
|
input tensor should be smaller than max_seq_len (4096)``.
|
||||||
|
This error indicates that an input sequence is longer than the model's maximum context window.
|
||||||
|
|
||||||
|
Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
|
||||||
|
tokens in this case). You can resolve this by truncating the input or splitting
|
||||||
|
it into smaller chunks before passing it to the model.
|
||||||
|
|
||||||
|
Note on reproducibility: The results in this guide are based on
|
||||||
|
commit ``b4c98ac`` from the upstream
|
||||||
|
`<https://github.com/pytorch/torchtune>`__ repository. For the
|
||||||
|
latest updates, you can use the main branch.
|
||||||
|
|
||||||
The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
|
|
||||||
does not currently provide YAML configuration files for other combinations of
|
|
||||||
model to fine-tuning method
|
|
||||||
However, you can still configure your own YAML files to enable support for
|
|
||||||
fine-tuning methods not listed here by following existing patterns in the
|
|
||||||
``/workspace/torchtune/recipes/configs`` directory.
|
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|
||||||
.. rubric:: Benchmarking examples
|
.. rubric:: Benchmarking examples
|
||||||
|
|
||||||
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
|
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
|
||||||
|
|
||||||
|
Multi-node training
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
Pre-training
|
||||||
|
~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
|
||||||
|
|
||||||
|
To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
# In the MAD repository
|
||||||
|
cd scripts/pytorch_train
|
||||||
|
sbatch run_slurm_train.sh
|
||||||
|
|
||||||
|
Fine-tuning
|
||||||
|
~~~~~~~~~~~
|
||||||
|
|
||||||
|
Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
|
||||||
|
|
||||||
|
To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
huggingface-cli login # Get access to HF Llama model space
|
||||||
|
huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
|
||||||
|
# In the MAD repository
|
||||||
|
cd scripts/pytorch_train
|
||||||
|
sbatch Torchtune_Multinode.sh
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Information regarding benchmark setup:
|
||||||
|
|
||||||
|
* By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
|
||||||
|
* You can adjust the torchtune `YAML configuration file
|
||||||
|
<https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
|
||||||
|
if you're using a different model.
|
||||||
|
* The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
|
||||||
|
* Set the ``mounting_paths`` inside the SLURM script.
|
||||||
|
|
||||||
|
Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
|
||||||
|
|
||||||
Further reading
|
Further reading
|
||||||
===============
|
===============
|
||||||
|
|||||||
@@ -21,6 +21,8 @@ In this guide, you'll learn about:
|
|||||||
|
|
||||||
- Training a model
|
- Training a model
|
||||||
|
|
||||||
|
- :doc:`With Primus (Megatron-LM backend) <benchmark-docker/primus-megatron>`
|
||||||
|
|
||||||
- :doc:`With Megatron-LM <benchmark-docker/megatron-lm>`
|
- :doc:`With Megatron-LM <benchmark-docker/megatron-lm>`
|
||||||
|
|
||||||
- :doc:`With PyTorch <benchmark-docker/pytorch-training>`
|
- :doc:`With PyTorch <benchmark-docker/pytorch-training>`
|
||||||
|
|||||||
@@ -285,7 +285,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
|
|||||||
- Radeon AI PRO R9700
|
- Radeon AI PRO R9700
|
||||||
- RDNA4
|
- RDNA4
|
||||||
- gfx1201
|
- gfx1201
|
||||||
- 16
|
- 32
|
||||||
- 64
|
- 64
|
||||||
- 32 or 64
|
- 32 or 64
|
||||||
- 128
|
- 128
|
||||||
|
|||||||
@@ -93,7 +93,7 @@ The following table shows whether a ROCm library is graph-safe.
|
|||||||
- ⚠️ (experimental)
|
- ⚠️ (experimental)
|
||||||
*
|
*
|
||||||
- `rocThrust <https://github.com/ROCm/rocThrust>`_
|
- `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||||
- ❌ (see :doc:`details <rocthrust:hipgraph-support>`)
|
- ❌ (see :doc:`details <rocthrust:reference/rocThrust-hipgraph-support>`)
|
||||||
*
|
*
|
||||||
- `rocWMMA <https://github.com/ROCm/rocWMMA>`_
|
- `rocWMMA <https://github.com/ROCm/rocWMMA>`_
|
||||||
- ❌
|
- ❌
|
||||||
|
|||||||
@@ -723,7 +723,7 @@ detailed description.
|
|||||||
- ❌/❌
|
- ❌/❌
|
||||||
|
|
||||||
*
|
*
|
||||||
- :doc:`MIGraphX <amdmigraphx:reference/cpp>`
|
- :doc:`MIGraphX <amdmigraphx:reference/MIGraphX-cpp>`
|
||||||
- ✅/✅
|
- ✅/✅
|
||||||
- ✅/✅
|
- ✅/✅
|
||||||
- ✅/✅
|
- ✅/✅
|
||||||
@@ -863,7 +863,7 @@ detailed description.
|
|||||||
- ✅/✅
|
- ✅/✅
|
||||||
|
|
||||||
*
|
*
|
||||||
- :doc:`MIGraphX <amdmigraphx:reference/cpp>`
|
- :doc:`MIGraphX <amdmigraphx:reference/MIGraphX-cpp>`
|
||||||
- ✅/✅
|
- ✅/✅
|
||||||
- ✅/✅
|
- ✅/✅
|
||||||
- ✅/✅
|
- ✅/✅
|
||||||
|
|||||||
@@ -27,6 +27,26 @@ subtrees:
|
|||||||
title: ROCm on Radeon GPUs
|
title: ROCm on Radeon GPUs
|
||||||
- file: how-to/deep-learning-rocm.md
|
- file: how-to/deep-learning-rocm.md
|
||||||
title: Deep learning frameworks
|
title: Deep learning frameworks
|
||||||
|
subtrees:
|
||||||
|
- entries:
|
||||||
|
- file: compatibility/ml-compatibility/pytorch-compatibility.rst
|
||||||
|
title: PyTorch compatibility
|
||||||
|
- file: compatibility/ml-compatibility/tensorflow-compatibility.rst
|
||||||
|
title: TensorFlow compatibility
|
||||||
|
- file: compatibility/ml-compatibility/jax-compatibility.rst
|
||||||
|
title: JAX compatibility
|
||||||
|
- file: compatibility/ml-compatibility/verl-compatibility.rst
|
||||||
|
title: verl compatibility
|
||||||
|
- file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
|
||||||
|
title: Stanford Megatron-LM compatibility
|
||||||
|
- file: compatibility/ml-compatibility/dgl-compatibility.rst
|
||||||
|
title: DGL compatibility
|
||||||
|
- file: compatibility/ml-compatibility/megablocks-compatibility.rst
|
||||||
|
title: Megablocks compatibility
|
||||||
|
- file: compatibility/ml-compatibility/ray-compatibility.rst
|
||||||
|
title: Ray compatibility
|
||||||
|
- file: compatibility/ml-compatibility/llama-cpp-compatibility.rst
|
||||||
|
title: llama.cpp compatibility
|
||||||
- file: how-to/build-rocm.rst
|
- file: how-to/build-rocm.rst
|
||||||
title: Build ROCm from source
|
title: Build ROCm from source
|
||||||
|
|
||||||
@@ -44,8 +64,8 @@ subtrees:
    title: Training
    subtrees:
    - entries:
-     - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
-       title: Train a model with Megatron-LM
+     - file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+       title: Train a model with Primus and Megatron-Core
      - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
        title: Train a model with PyTorch
      - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst

@@ -1,4 +1,4 @@
|
|||||||
rocm-docs-core==1.20.1
|
rocm-docs-core==1.26.0
|
||||||
sphinx-reredirects
|
sphinx-reredirects
|
||||||
sphinx-sitemap
|
sphinx-sitemap
|
||||||
sphinxcontrib.datatemplates==0.11.0
|
sphinxcontrib.datatemplates==0.11.0
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ alabaster==1.0.0
      # via sphinx
  asttokens==3.0.0
      # via stack-data
- attrs==25.3.0
+ attrs==25.4.0
      # via
      # jsonschema
      # jupyter-cache

@@ -19,34 +19,32 @@ babel==2.17.0
      # via
      # pydata-sphinx-theme
      # sphinx
- beautifulsoup4==4.13.4
+ beautifulsoup4==4.14.2
      # via pydata-sphinx-theme
  breathe==4.36.0
      # via rocm-docs-core
- certifi==2025.4.26
+ certifi==2025.10.5
      # via requests
- cffi==1.17.1
+ cffi==2.0.0
      # via
      # cryptography
      # pynacl
- charset-normalizer==3.4.2
+ charset-normalizer==3.4.4
      # via requests
- click==8.2.1
+ click==8.3.0
      # via
      # jupyter-cache
      # sphinx-external-toc
- comm==0.2.2
+ comm==0.2.3
      # via ipykernel
- cryptography==45.0.3
+ cryptography==46.0.2
      # via pyjwt
- debugpy==1.8.14
+ debugpy==1.8.17
      # via ipykernel
  decorator==5.2.1
      # via ipython
  defusedxml==0.7.1
      # via sphinxcontrib-datatemplates
- deprecated==1.2.18
-     # via pygithub
  docutils==0.21.2
      # via
      # myst-parser

@@ -54,19 +52,19 @@ docutils==0.21.2
      # sphinx
  exceptiongroup==1.3.0
      # via ipython
- executing==2.2.0
+ executing==2.2.1
      # via stack-data
- fastjsonschema==2.21.1
+ fastjsonschema==2.21.2
      # via
      # nbformat
      # rocm-docs-core
  gitdb==4.0.12
      # via gitpython
- gitpython==3.1.44
+ gitpython==3.1.45
      # via rocm-docs-core
- greenlet==3.2.3
+ greenlet==3.2.4
      # via sqlalchemy
- idna==3.10
+ idna==3.11
      # via requests
  imagesize==1.4.1
      # via sphinx

@@ -74,7 +72,7 @@ importlib-metadata==8.7.0
      # via
      # jupyter-cache
      # myst-nb
- ipykernel==6.29.5
+ ipykernel==7.0.0
      # via myst-nb
  ipython==8.37.0
      # via

@@ -86,9 +84,9 @@ jinja2==3.1.6
      # via
      # myst-parser
      # sphinx
- jsonschema==4.24.0
+ jsonschema==4.25.1
      # via nbformat
- jsonschema-specifications==2025.4.1
+ jsonschema-specifications==2025.9.1
      # via jsonschema
  jupyter-cache==1.0.1
      # via myst-nb

@@ -106,17 +104,17 @@ markdown-it-py==3.0.0
      # via
      # mdit-py-plugins
      # myst-parser
- markupsafe==3.0.2
+ markupsafe==3.0.3
      # via jinja2
  matplotlib-inline==0.1.7
      # via
      # ipykernel
      # ipython
- mdit-py-plugins==0.4.2
+ mdit-py-plugins==0.5.0
      # via myst-parser
  mdurl==0.1.2
      # via markdown-it-py
- myst-nb==1.2.0
+ myst-nb==1.3.0
      # via rocm-docs-core
  myst-parser==4.0.1
      # via myst-nb

@@ -134,31 +132,30 @@ nest-asyncio==1.6.0
  packaging==25.0
      # via
      # ipykernel
-     # pydata-sphinx-theme
      # sphinx
- parso==0.8.4
+ parso==0.8.5
      # via jedi
  pexpect==4.9.0
      # via ipython
- platformdirs==4.3.8
+ platformdirs==4.5.0
      # via jupyter-core
- prompt-toolkit==3.0.51
+ prompt-toolkit==3.0.52
      # via ipython
- psutil==7.0.0
+ psutil==7.1.0
      # via ipykernel
  ptyprocess==0.7.0
      # via pexpect
  pure-eval==0.2.3
      # via stack-data
- pycparser==2.22
+ pycparser==2.23
      # via cffi
- pydata-sphinx-theme==0.15.4
+ pydata-sphinx-theme==0.16.1
      # via
      # rocm-docs-core
      # sphinx-book-theme
- pygithub==2.6.1
+ pygithub==2.8.1
      # via rocm-docs-core
- pygments==2.19.1
+ pygments==2.19.2
      # via
      # accessible-pygments
      # ipython

@@ -166,11 +163,11 @@ pygments==2.19.1
      # sphinx
  pyjwt[crypto]==2.10.1
      # via pygithub
- pynacl==1.5.0
+ pynacl==1.6.0
      # via pygithub
  python-dateutil==2.9.0.post0
      # via jupyter-client
- pyyaml==6.0.2
+ pyyaml==6.0.3
      # via
      # jupyter-cache
      # myst-nb

@@ -178,21 +175,21 @@ pyyaml==6.0.2
      # rocm-docs-core
      # sphinx-external-toc
      # sphinxcontrib-datatemplates
- pyzmq==26.4.0
+ pyzmq==27.1.0
      # via
      # ipykernel
      # jupyter-client
- referencing==0.36.2
+ referencing==0.37.0
      # via
      # jsonschema
      # jsonschema-specifications
- requests==2.32.4
+ requests==2.32.5
      # via
      # pygithub
      # sphinx
- rocm-docs-core==1.20.1
+ rocm-docs-core==1.26.0
      # via -r requirements.in
- rpds-py==0.25.1
+ rpds-py==0.27.1
      # via
      # jsonschema
      # referencing

@@ -202,7 +199,7 @@ smmap==5.0.2
      # via gitdb
  snowballstemmer==3.0.1
      # via sphinx
- soupsieve==2.7
+ soupsieve==2.8
      # via beautifulsoup4
  sphinx==8.1.3
      # via

@@ -220,7 +217,7 @@ sphinx==8.1.3
      # sphinx-reredirects
      # sphinxcontrib-datatemplates
      # sphinxcontrib-runcmd
- sphinx-book-theme==1.1.4
+ sphinx-book-theme==1.1.3
      # via rocm-docs-core
  sphinx-copybutton==0.5.2
      # via rocm-docs-core

@@ -234,7 +231,7 @@ sphinx-notfound-page==1.1.0
      # via rocm-docs-core
  sphinx-reredirects==0.1.6
      # via -r requirements.in
- sphinx-sitemap==2.7.2
+ sphinx-sitemap==2.9.0
      # via -r requirements.in
  sphinxcontrib-applehelp==2.0.0
      # via sphinx

@@ -252,21 +249,20 @@ sphinxcontrib-runcmd==0.2.0
      # via sphinxcontrib-datatemplates
  sphinxcontrib-serializinghtml==2.0.0
      # via sphinx
- sqlalchemy==2.0.41
+ sqlalchemy==2.0.44
      # via jupyter-cache
  stack-data==0.6.3
      # via ipython
  tabulate==0.9.0
      # via jupyter-cache
- tomli==2.2.1
+ tomli==2.3.0
      # via sphinx
- tornado==6.5.1
+ tornado==6.5.2
      # via
      # ipykernel
      # jupyter-client
  traitlets==5.14.3
      # via
-     # comm
      # ipykernel
      # ipython
      # jupyter-client

@@ -274,9 +270,10 @@ traitlets==5.14.3
      # matplotlib-inline
      # nbclient
      # nbformat
- typing-extensions==4.14.0
+ typing-extensions==4.15.0
      # via
      # beautifulsoup4
+     # cryptography
      # exceptiongroup
      # ipython
      # myst-nb

@@ -288,9 +285,7 @@ urllib3==2.5.0
      # via
      # pygithub
      # requests
- wcwidth==0.2.13
+ wcwidth==0.2.14
      # via prompt-toolkit
- wrapt==1.17.2
-     # via deprecated
  zipp==3.23.0
      # via importlib-metadata

@@ -7,15 +7,14 @@ html {
    --compat-head-color: var(--pst-color-surface);
    --compat-param-hover-color: var(--pst-color-link-hover);
    --compat-param-selected-color: var(--pst-color-primary);
+   --compat-border-color: var(--pst-color-border);
  }

  html[data-theme="light"] {
-   --compat-border-color: var(--pst-gray-500);
    --compat-param-disabled-color: var(--pst-gray-300);
  }

  html[data-theme="dark"] {
-   --compat-border-color: var(--pst-gray-600);
    --compat-param-disabled-color: var(--pst-gray-600);
  }

@@ -23,6 +22,7 @@ div#vllm-benchmark-ud-params-picker.container-fluid {
    padding: 0 0 1rem 0;
  }

+ div[data-param-k="model-group"],
  div[data-param-k="model"] {
    background-color: var(--compat-bg-color);
    padding: 2px;

@@ -31,40 +31,19 @@ div[data-param-k="model"] {
    cursor: pointer;
  }

+ div[data-param-k="model-group"][data-param-state="selected"],
  div[data-param-k="model"][data-param-state="selected"] {
    background-color: var(--compat-param-selected-color);
    color: var(--compat-fg-color);
  }

- div[data-param-k="model"][data-param-state="latest-version"] {
-   background-color: var(--compat-param-selected-color);
-   color: var(--compat-fg-color);
- }
-
- div[data-param-k="model"][data-param-state="disabled"] {
-   background-color: var(--compat-param-disabled-color);
-   text-decoration: line-through;
-   /* text-decoration-color: var(--pst-color-danger); */
-   cursor: auto;
- }
-
- div[data-param-k="model"]:not([data-param-state]):hover {
+ div[data-param-k="model-group"]:hover,
+ div[data-param-k="model"]:hover {
    background-color: var(--compat-param-hover-color);
- }
-
- div[data-param-k="model-group"] {
-   background-color: var(--compat-bg-color);
-   padding: 2px;
-   border: solid 1px var(--compat-border-color);
-   font-weight: 500;
-   cursor: pointer;
- }
-
- div[data-param-k="model-group"][data-param-state="selected"] {
-   background-color: var(--compat-param-selected-color);
    color: var(--compat-fg-color);
  }

+ /*
  div[data-param-k="model-group"][data-param-state="latest-version"] {
    background-color: var(--compat-param-selected-color);
    color: var(--compat-fg-color);

@@ -73,26 +52,19 @@ div[data-param-k="model-group"][data-param-state="latest-version"] {
  div[data-param-k="model-group"][data-param-state="disabled"] {
    background-color: var(--compat-param-disabled-color);
    text-decoration: line-through;
-   /* text-decoration-color: var(--pst-color-danger); */
+   text-decoration-color: var(--pst-color-danger);
    cursor: auto;
  }
-
- div[data-param-k="model-group"]:not([data-param-state]):hover {
-   background-color: var(--compat-param-hover-color);
- }
+ */

  .model-param-head {
    background-color: var(--compat-head-color);
    padding: 0.15rem 0.15rem 0.15rem 0.67rem;
-   /* margin: 2px; */
-   border-right: solid 2px var(--compat-accent-color);
+   border-right: solid 4px var(--compat-accent-color);
    font-weight: 600;
  }

  .model-param {
-   /* padding: 2px; */
-   /* margin: 0 2px 0 2px; */
-   /* margin: 2px; */
    border: solid 1px var(--compat-border-color);
    font-weight: 500;
  }