Mirror of https://github.com/ROCm/ROCm.git, synced 2026-01-10 23:28:03 -05:00

Compare commits: docs/rever...fix-links (35 commits)
| SHA1 |
|---|
| b3211cc6fa |
| 5853468fca |
| 245c95690f |
| 39c1b926f6 |
| 3c3847f9f7 |
| 249bd177ec |
| b2ee8d4b2e |
| 3f834cf520 |
| 70ba866c5b |
| 320ec4669a |
| c9bd93b537 |
| a060550bcd |
| c92cbaee66 |
| c84afacc8d |
| 843fd1b3fb |
| 82221c4e2d |
| d0ebe126e7 |
| 74610893a9 |
| afe3e21cad |
| ae2440772f |
| 61f970a24d |
| 85a1682573 |
| 87c6e320b4 |
| b50948fe6b |
| 91407405a9 |
| 8f23f63a6b |
| 11747aaadc |
| 1088beefe5 |
| b7988925a5 |
| 89dafa6232 |
| 8054852dad |
| 542d7813ce |
| bc1ffe4fcb |
| 09997c68bb |
| 42bc3501ac |
@@ -5,6 +5,7 @@ ACEs
 ACS
 AccVGPR
 AccVGPRs
+AITER
 ALU
 AllReduce
 AMD
@@ -115,6 +116,7 @@ Deprecations
 DevCap
 DirectX
 Dockerfile
+Dockerized
 Doxygen
 dropless
 ELMo
@@ -122,6 +124,7 @@ ENDPGM
 EPYC
 ESXi
 EoS
+fas
 FBGEMM
 FFT
 FFTs
@@ -194,6 +197,7 @@ HWE
 HWS
 Haswell
 Higgs
+href
 Hyperparameters
 Huggingface
 ICD
@@ -360,6 +364,7 @@ PowerEdge
 PowerShell
 Pretrained
 Pretraining
+Primus
 Profiler's
 PyPi
 Pytest
@@ -524,6 +529,7 @@ Xilinx
 Xnack
 Xteam
 YAML
+YAMLs
 YML
 YModel
 ZeRO
@@ -584,6 +590,7 @@ completers
 composable
 concretization
 config
+configs
 conformant
 constructible
 convolutional
@@ -794,7 +801,9 @@ preprocessing
 preprocessor
 prequantized
 prerequisites
+pretrain
 pretraining
+primus
 profiler
 profilers
 protobuf
@@ -909,6 +918,7 @@ toolchain
 toolchains
 toolset
 toolsets
+torchtitan
 torchvision
 tqdm
 tracebacks
@@ -31,9 +31,9 @@ ROCm Version,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6
 :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
 :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
 :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A
-:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,N/A,N/A,85f95ae,85f95ae,85f95ae,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
-:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`,N/A,N/A,N/A,N/A,0.7.0,0.7.0,0.7.0,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
 ,,,,,,,,,,,,,,,,,,
@@ -242,7 +242,9 @@ Expand for full historical view of:
 .. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
 .. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
 .. [#verl_compat] verl is only supported on ROCm 6.2.0.
+.. [#stanford-megatron-lm_compat] Stanford Megatron-LM is only supported on ROCm 6.3.0.
 .. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
+.. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
 .. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
 .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
 .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
@@ -117,11 +117,15 @@ article_pages = [
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
@@ -147,6 +151,8 @@ article_pages = [
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
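The `article_pages` entries above drive per-page OS tagging in the documentation's Sphinx configuration. As a rough sketch only (it assumes `article_pages` is the plain list of dicts shown in this hunk; the `pages_for_os` helper is hypothetical and not part of the repository), the Linux-tagged pages could be collected like this:

```python
# Hypothetical helper, not part of conf.py. Assumes article_pages is the
# list of {"file": ..., "os": [...]} dicts shown in the hunk above.
article_pages = [
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
]

def pages_for_os(pages, target_os):
    """Return the doc paths whose 'os' list includes target_os."""
    return [entry["file"] for entry in pages if target_os in entry.get("os", [])]

print(pages_for_os(article_pages, "linux"))
```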
@@ -0,0 +1,163 @@
+vllm_benchmark:
+  unified_docker:
+    latest:
+      # TODO: update me
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
+      rocm_version: 6.4.1
+      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a
+      hipblaslt_version: 0.15
+  model_groups:
+    - group: Meta Llama
+      tag: llama
+      models:
+        - model: Llama 3.1 8B
+          mad_tag: pyt_vllm_llama-3.1-8b
+          model_repo: meta-llama/Llama-3.1-8B-Instruct
+          url: https://huggingface.co/meta-llama/Llama-3.1-8B
+          precision: float16
+        - model: Llama 3.1 70B
+          mad_tag: pyt_vllm_llama-3.1-70b
+          model_repo: meta-llama/Llama-3.1-70B-Instruct
+          url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+          precision: float16
+        - model: Llama 3.1 405B
+          mad_tag: pyt_vllm_llama-3.1-405b
+          model_repo: meta-llama/Llama-3.1-405B-Instruct
+          url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+          precision: float16
+        - model: Llama 2 7B
+          mad_tag: pyt_vllm_llama-2-7b
+          model_repo: meta-llama/Llama-2-7b-chat-hf
+          url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+          precision: float16
+        - model: Llama 2 70B
+          mad_tag: pyt_vllm_llama-2-70b
+          model_repo: meta-llama/Llama-2-70b-chat-hf
+          url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+          precision: float16
+        - model: Llama 3.1 8B FP8
+          mad_tag: pyt_vllm_llama-3.1-8b_fp8
+          model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+          url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+          precision: float8
+        - model: Llama 3.1 70B FP8
+          mad_tag: pyt_vllm_llama-3.1-70b_fp8
+          model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+          url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+          precision: float8
+        - model: Llama 3.1 405B FP8
+          mad_tag: pyt_vllm_llama-3.1-405b_fp8
+          model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+          url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+          precision: float8
+    - group: Mistral AI
+      tag: mistral
+      models:
+        - model: Mixtral MoE 8x7B
+          mad_tag: pyt_vllm_mixtral-8x7b
+          model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+          url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+          precision: float16
+        - model: Mixtral MoE 8x22B
+          mad_tag: pyt_vllm_mixtral-8x22b
+          model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+          url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+          precision: float16
+        - model: Mistral 7B
+          mad_tag: pyt_vllm_mistral-7b
+          model_repo: mistralai/Mistral-7B-Instruct-v0.3
+          url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+          precision: float16
+        - model: Mixtral MoE 8x7B FP8
+          mad_tag: pyt_vllm_mixtral-8x7b_fp8
+          model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+          url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+          precision: float8
+        - model: Mixtral MoE 8x22B FP8
+          mad_tag: pyt_vllm_mixtral-8x22b_fp8
+          model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+          url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+          precision: float8
+        - model: Mistral 7B FP8
+          mad_tag: pyt_vllm_mistral-7b_fp8
+          model_repo: amd/Mistral-7B-v0.1-FP8-KV
+          url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
+          precision: float8
+    - group: Qwen
+      tag: qwen
+      models:
+        - model: Qwen2 7B
+          mad_tag: pyt_vllm_qwen2-7b
+          model_repo: Qwen/Qwen2-7B-Instruct
+          url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
+          precision: float16
+        - model: Qwen2 72B
+          mad_tag: pyt_vllm_qwen2-72b
+          model_repo: Qwen/Qwen2-72B-Instruct
+          url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
+          precision: float16
+        - model: QwQ-32B
+          mad_tag: pyt_vllm_qwq-32b
+          model_repo: Qwen/QwQ-32B
+          url: https://huggingface.co/Qwen/QwQ-32B
+          precision: float16
+          tunableop: true
+    - group: Databricks DBRX
+      tag: dbrx
+      models:
+        - model: DBRX Instruct
+          mad_tag: pyt_vllm_dbrx-instruct
+          model_repo: databricks/dbrx-instruct
+          url: https://huggingface.co/databricks/dbrx-instruct
+          precision: float16
+        - model: DBRX Instruct FP8
+          mad_tag: pyt_vllm_dbrx_fp8
+          model_repo: amd/dbrx-instruct-FP8-KV
+          url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
+          precision: float8
+    - group: Google Gemma
+      tag: gemma
+      models:
+        - model: Gemma 2 27B
+          mad_tag: pyt_vllm_gemma-2-27b
+          model_repo: google/gemma-2-27b
+          url: https://huggingface.co/google/gemma-2-27b
+          precision: float16
+    - group: Cohere
+      tag: cohere
+      models:
+        - model: C4AI Command R+ 08-2024
+          mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
+          model_repo: CohereForAI/c4ai-command-r-plus-08-2024
+          url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
+          precision: float16
+        - model: C4AI Command R+ 08-2024 FP8
+          mad_tag: pyt_vllm_command-r-plus_fp8
+          model_repo: amd/c4ai-command-r-plus-FP8-KV
+          url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
+          precision: float8
+    - group: DeepSeek
+      tag: deepseek
+      models:
+        - model: DeepSeek MoE 16B
+          mad_tag: pyt_vllm_deepseek-moe-16b-chat
+          model_repo: deepseek-ai/deepseek-moe-16b-chat
+          url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
+          precision: float16
+    - group: Microsoft Phi
+      tag: phi
+      models:
+        - model: Phi-4
+          mad_tag: pyt_vllm_phi-4
+          model_repo: microsoft/phi-4
+          url: https://huggingface.co/microsoft/phi-4
+    - group: TII Falcon
+      tag: falcon
+      models:
+        - model: Falcon 180B
+          mad_tag: pyt_vllm_falcon-180b
+          model_repo: tiiuae/falcon-180B
+          url: https://huggingface.co/tiiuae/falcon-180B
+          precision: float16
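The new data file above is the model catalog that the vLLM benchmark pages render. A minimal sketch of reading it (assumptions: PyYAML is installed, the nesting matches the reconstruction above, and `benchmark-models.yaml` is a placeholder path rather than the repository filename):

```python
# Illustrative only: load the vLLM benchmark catalog and list its models.
import yaml

with open("benchmark-models.yaml") as f:  # placeholder path
    data = yaml.safe_load(f)

docker = data["vllm_benchmark"]["unified_docker"]["latest"]
print("Docker image:", docker["pull_tag"])

# Print the MAD tag used to launch each model's benchmark run.
for group in data["vllm_benchmark"]["model_groups"]:
    for model in group["models"]:
        print(f'{group["group"]}: {model["model"]} -> {model["mad_tag"]}')
```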
@@ -39,7 +39,7 @@ pytorch_inference_benchmark:
           model_repo: Wan-AI/Wan2.1-T2V-14B
           url: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
           precision: bfloat16
-    - group: Janus-Pro
+    - group: Janus Pro
       tag: janus-pro
       models:
         - model: Janus Pro 7B
@@ -47,3 +47,11 @@ pytorch_inference_benchmark:
           model_repo: deepseek-ai/Janus-Pro-7B
           url: https://huggingface.co/deepseek-ai/Janus-Pro-7B
           precision: bfloat16
+    - group: Hunyuan Video
+      tag: hunyuan
+      models:
+        - model: Hunyuan Video
+          mad_tag: pyt_hy_video
+          model_repo: tencent/HunyuanVideo
+          url: https://huggingface.co/tencent/HunyuanVideo
+          precision: float16
@@ -2,11 +2,11 @@ vllm_benchmark:
   unified_docker:
     latest:
       # TODO: update me
-      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
       rocm_version: 6.4.1
-      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
-      pytorch_version: 2.7.0+gitf717b2a
+      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a (2.7.0+gitf717b2a)
       hipblaslt_version: 0.15
   model_groups:
     - group: Meta Llama
@@ -27,11 +27,6 @@ vllm_benchmark:
           model_repo: meta-llama/Llama-3.1-405B-Instruct
           url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
           precision: float16
-        - model: Llama 2 7B
-          mad_tag: pyt_vllm_llama-2-7b
-          model_repo: meta-llama/Llama-2-7b-chat-hf
-          url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
-          precision: float16
         - model: Llama 2 70B
           mad_tag: pyt_vllm_llama-2-70b
           model_repo: meta-llama/Llama-2-70b-chat-hf
@@ -65,11 +60,6 @@ vllm_benchmark:
           model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
           url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
           precision: float16
-        - model: Mistral 7B
-          mad_tag: pyt_vllm_mistral-7b
-          model_repo: mistralai/Mistral-7B-Instruct-v0.3
-          url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
-          precision: float16
         - model: Mixtral MoE 8x7B FP8
           mad_tag: pyt_vllm_mixtral-8x7b_fp8
           model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
@@ -80,72 +70,15 @@ vllm_benchmark:
           model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
           url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
           precision: float8
-        - model: Mistral 7B FP8
-          mad_tag: pyt_vllm_mistral-7b_fp8
-          model_repo: amd/Mistral-7B-v0.1-FP8-KV
-          url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
-          precision: float8
     - group: Qwen
       tag: qwen
       models:
-        - model: Qwen2 7B
-          mad_tag: pyt_vllm_qwen2-7b
-          model_repo: Qwen/Qwen2-7B-Instruct
-          url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
-          precision: float16
-        - model: Qwen2 72B
-          mad_tag: pyt_vllm_qwen2-72b
-          model_repo: Qwen/Qwen2-72B-Instruct
-          url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
-          precision: float16
         - model: QwQ-32B
           mad_tag: pyt_vllm_qwq-32b
           model_repo: Qwen/QwQ-32B
           url: https://huggingface.co/Qwen/QwQ-32B
           precision: float16
           tunableop: true
-    - group: Databricks DBRX
-      tag: dbrx
-      models:
-        - model: DBRX Instruct
-          mad_tag: pyt_vllm_dbrx-instruct
-          model_repo: databricks/dbrx-instruct
-          url: https://huggingface.co/databricks/dbrx-instruct
-          precision: float16
-        - model: DBRX Instruct FP8
-          mad_tag: pyt_vllm_dbrx_fp8
-          model_repo: amd/dbrx-instruct-FP8-KV
-          url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
-          precision: float8
-    - group: Google Gemma
-      tag: gemma
-      models:
-        - model: Gemma 2 27B
-          mad_tag: pyt_vllm_gemma-2-27b
-          model_repo: google/gemma-2-27b
-          url: https://huggingface.co/google/gemma-2-27b
-          precision: float16
-    - group: Cohere
-      tag: cohere
-      models:
-        - model: C4AI Command R+ 08-2024
-          mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
-          model_repo: CohereForAI/c4ai-command-r-plus-08-2024
-          url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
-          precision: float16
-        - model: C4AI Command R+ 08-2024 FP8
-          mad_tag: pyt_vllm_command-r-plus_fp8
-          model_repo: amd/c4ai-command-r-plus-FP8-KV
-          url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
-          precision: float8
-    - group: DeepSeek
-      tag: deepseek
-      models:
-        - model: DeepSeek MoE 16B
-          mad_tag: pyt_vllm_deepseek-moe-16b-chat
-          model_repo: deepseek-ai/deepseek-moe-16b-chat
-          url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
-          precision: float16
     - group: Microsoft Phi
       tag: phi
       models:
@@ -153,11 +86,3 @@ vllm_benchmark:
           mad_tag: pyt_vllm_phi-4
           model_repo: microsoft/phi-4
           url: https://huggingface.co/microsoft/phi-4
-    - group: TII Falcon
-      tag: falcon
-      models:
-        - model: Falcon 180B
-          mad_tag: pyt_vllm_falcon-180b
-          model_repo: tiiuae/falcon-180B
-          url: https://huggingface.co/tiiuae/falcon-180B
-          precision: float16
@@ -1,26 +1,15 @@
 dockers:
-  - pull_tag: rocm/megatron-lm:v25.6_py312
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
     components:
-      ROCm: 6.4.1
-      PyTorch: 2.8.0a0+git7d205b2
-      Python: 3.12
-      Transformer Engine: 2.1.0.dev0+8c4a512
-      hipBLASLt: 393e413
-      Triton: 3.3.0
-      RCCL: 2.23.4.7a84c5d
-    doc_name: Ubuntu 24.04 + Python 3.12
-  - pull_tag: rocm/megatron-lm:v25.6_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
-    components:
-      ROCm: 6.4.1
-      PyTorch: 2.8.0a0+git7d205b2
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
       Python: "3.10"
-      Transformer Engine: 2.1.0.dev0+8c4a512
-      hipBLASLt: 393e413
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
       Triton: 3.3.0
-      RCCL: 2.23.4.7a84c5d
-    doc_name: Ubuntu 22.04 + Python 3.10
+      RCCL: 2.22.3
 model_groups:
   - group: Meta Llama
     tag: llama
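The `pull_tag` field in each docker entry is the image reference a user pulls before running the training workloads. A minimal sketch of driving that from Python (assuming Docker is installed locally and the tag shown in this hunk is still published on Docker Hub):

```python
# Pull the Megatron-LM training image referenced by the pull_tag field above.
# Assumes Docker is installed and the tag is still published on Docker Hub.
import subprocess

PULL_TAG = "rocm/megatron-lm:v25.7_py310"  # value taken from the hunk above

subprocess.run(["docker", "pull", PULL_TAG], check=True)
```

Keeping the tag in a single data file lets every rendered page reference one source of truth when the image is updated.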
@@ -0,0 +1,60 @@
+dockers:
+  - pull_tag: rocm/megatron-lm:v25.6_py312
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
+    components:
+      ROCm: 6.4.1
+      PyTorch: 2.8.0a0+git7d205b2
+      Python: 3.12
+      Transformer Engine: 2.1.0.dev0+8c4a512
+      hipBLASLt: 393e413
+      Triton: 3.3.0
+      RCCL: 2.23.4.7a84c5d
+    doc_name: Ubuntu 24.04 + Python 3.12
+  - pull_tag: rocm/megatron-lm:v25.6_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
+    components:
+      ROCm: 6.4.1
+      PyTorch: 2.8.0a0+git7d205b2
+      Python: "3.10"
+      Transformer Engine: 2.1.0.dev0+8c4a512
+      hipBLASLt: 393e413
+      Triton: 3.3.0
+      RCCL: 2.23.4.7a84c5d
+    doc_name: Ubuntu 22.04 + Python 3.10
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
+      - model: Llama 3.1 8B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
+      - model: Llama 3.1 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
+      - model: Llama 3.1 70B (proxy)
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
+      - model: Llama 2 7B
+        mad_tag: pyt_megatron_lm_train_llama-2-7b
+      - model: Llama 2 70B
+        mad_tag: pyt_megatron_lm_train_llama-2-70b
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
+      - model: DeepSeek-V2-Lite
+        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
+      - model: Qwen 2.5 72B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
@@ -0,0 +1,120 @@
+unified_docker:
+  latest:
+    pull_tag: rocm/pytorch-training:v25.6
+    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
+    rocm_version: 6.4.1
+    pytorch_version: 2.8.0a0+git7d205b2
+    python_version: 3.10.17
+    transformer_engine_version: 1.14.0+2f85f5f2
+    flash_attention_version: 3.0.0.post1
+    hipblaslt_version: 0.15.0-8c6919d
+    triton_version: 3.3.0
+model_groups:
+  - group: Pre-training
+    tag: pre-training
+    models:
+      - model: Llama 3.1 8B
+        mad_tag: pyt_train_llama-3.1-8b
+        model_repo: Llama-3.1-8B
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B
+        precision: BF16
+        training_modes: [pretrain]
+      - model: Llama 3.1 70B
+        mad_tag: pyt_train_llama-3.1-70b
+        model_repo: Llama-3.1-70B
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+        precision: BF16
+        training_modes: [pretrain]
+      - model: FLUX.1-dev
+        mad_tag: pyt_train_flux
+        model_repo: Flux
+        url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+        precision: BF16
+        training_modes: [pretrain]
+  - group: Fine-tuning
+    tag: fine-tuning
+    models:
+      - model: Llama 4 Scout 17B-16E
+        mad_tag: pyt_train_llama-4-scout-17b-16e
+        model_repo: Llama-4-17B_16E
+        url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
+        precision: BF16
+        training_modes: [finetune_fw, finetune_lora]
+      - model: Llama 3.3 70B
+        mad_tag: pyt_train_llama-3.3-70b
+        model_repo: Llama-3.3-70B
+        url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
+        precision: BF16
+        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+      - model: Llama 3.2 1B
+        mad_tag: pyt_train_llama-3.2-1b
+        model_repo: Llama-3.2-1B
+        url: https://huggingface.co/meta-llama/Llama-3.2-1B
+        precision: BF16
+        training_modes: [finetune_fw, finetune_lora]
+      - model: Llama 3.2 3B
+        mad_tag: pyt_train_llama-3.2-3b
+        model_repo: Llama-3.2-3B
+        url: https://huggingface.co/meta-llama/Llama-3.2-3B
+        precision: BF16
+        training_modes: [finetune_fw, finetune_lora]
+      - model: Llama 3.2 Vision 11B
+        mad_tag: pyt_train_llama-3.2-vision-11b
+        model_repo: Llama-3.2-Vision-11B
+        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
+        precision: BF16
+        training_modes: [finetune_fw]
+      - model: Llama 3.2 Vision 90B
+        mad_tag: pyt_train_llama-3.2-vision-90b
+        model_repo: Llama-3.2-Vision-90B
+        url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
+        precision: BF16
+        training_modes: [finetune_fw]
+      - model: Llama 3.1 8B
+        mad_tag: pyt_train_llama-3.1-8b
+        model_repo: Llama-3.1-8B
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B
+        precision: BF16
+        training_modes: [finetune_fw, finetune_lora]
+      - model: Llama 3.1 70B
+        mad_tag: pyt_train_llama-3.1-70b
+        model_repo: Llama-3.1-70B
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B
+        precision: BF16
+        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+      - model: Llama 3.1 405B
+        mad_tag: pyt_train_llama-3.1-405b
+        model_repo: Llama-3.1-405B
+        url: https://huggingface.co/meta-llama/Llama-3.1-405B
+        precision: BF16
+        training_modes: [finetune_qlora, HF_finetune_lora]
+      - model: Llama 3 8B
+        mad_tag: pyt_train_llama-3-8b
+        model_repo: Llama-3-8B
+        url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+        precision: BF16
+        training_modes: [finetune_fw, finetune_lora]
+      - model: Llama 3 70B
+        mad_tag: pyt_train_llama-3-70b
+        model_repo: Llama-3-70B
+        url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
+        precision: BF16
+        training_modes: [finetune_fw, finetune_lora]
+      - model: Llama 2 7B
+        mad_tag: pyt_train_llama-2-7b
+        model_repo: Llama-2-7B
+        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+        precision: BF16
+        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+      - model: Llama 2 13B
+        mad_tag: pyt_train_llama-2-13b
+        model_repo: Llama-2-13B
+        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+        precision: BF16
+        training_modes: [finetune_fw, finetune_lora]
+      - model: Llama 2 70B
+        mad_tag: pyt_train_llama-2-70b
+        model_repo: Llama-2-70B
+        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+        precision: BF16
+        training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
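Each model entry in the file above carries a `training_modes` list, which is how the docs distinguish pre-training from the various fine-tuning workflows. A quick illustrative filter (assuming PyYAML, the `model_groups` layout reconstructed above, and a placeholder file path):

```python
# Illustrative only: list the models that support a given training mode.
import yaml

with open("pytorch-training-models.yaml") as f:  # placeholder path
    catalog = yaml.safe_load(f)

def models_supporting(catalog, mode):
    """Yield (group name, model name) pairs whose training_modes include mode."""
    for group in catalog["model_groups"]:
        for model in group["models"]:
            if mode in model.get("training_modes", []):
                yield group["group"], model["model"]

for group_name, model_name in models_supporting(catalog, "finetune_qlora"):
    print(f"{group_name}: {model_name}")
```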
@@ -0,0 +1,58 @@
+dockers:
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
+    components:
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
+      Python: "3.10"
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
+      Triton: 3.3.0
+      RCCL: 2.22.3
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
+        config_name: llama3.3_70B-pretrain.yaml
+      - model: Llama 3.1 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
+        config_name: llama3.1_70B-pretrain.yaml
+      - model: Llama 3.1 8B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
+        config_name: llama3.1_8B-pretrain.yaml
+      - model: Llama 2 7B
+        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
+        config_name: llama2_7B-pretrain.yaml
+      - model: Llama 2 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
+        config_name: llama2_70B-pretrain.yaml
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
+        config_name: deepseek_v3-pretrain.yaml
+      - model: DeepSeek-V2-Lite
+        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
+        config_name: deepseek_v2_lite-pretrain.yaml
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
+        config_name: mixtral_8x7B_v0.1-pretrain.yaml
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
+        config_name: mixtral_8x22B_v0.1-pretrain.yaml
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
+        config_name: primus_qwen2.5_7B-pretrain.yaml
+      - model: Qwen 2.5 72B
+        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
+        config_name: qwen2.5_72B-pretrain.yaml
@@ -1,38 +1,17 @@
-unified_docker:
-  latest:
-    pull_tag: rocm/pytorch-training:v25.6
-    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
-    rocm_version: 6.4.1
-    pytorch_version: 2.8.0a0+git7d205b2
-    python_version: 3.10.17
-    transformer_engine_version: 1.14.0+2f85f5f2
-    flash_attention_version: 3.0.0.post1
-    hipblaslt_version: 0.15.0-8c6919d
-    triton_version: 3.3.0
+dockers:
+  - pull_tag: rocm/pytorch-training:v25.7
+    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
+    components:
+      ROCm: 6.4.2
+      PyTorch: 2.8.0a0+gitd06a406
+      Python: 3.10.18
+      Transformer Engine: 2.2.0.dev0+94e53dd8
+      Flash Attention: 3.0.0.post1
+      hipBLASLt: 1.1.0-4b9a52edfc
+      Triton: 3.3.0
 model_groups:
-  - group: Pre-training
-    tag: pre-training
-    models:
-      - model: Llama 3.1 8B
-        mad_tag: pyt_train_llama-3.1-8b
-        model_repo: Llama-3.1-8B
-        url: https://huggingface.co/meta-llama/Llama-3.1-8B
-        precision: BF16
-        training_modes: [pretrain]
-      - model: Llama 3.1 70B
-        mad_tag: pyt_train_llama-3.1-70b
-        model_repo: Llama-3.1-70B
-        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-        precision: BF16
-        training_modes: [pretrain]
-      - model: FLUX.1-dev
-        mad_tag: pyt_train_flux
-        model_repo: Flux
-        url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-        precision: BF16
-        training_modes: [pretrain]
-  - group: Fine-tuning
-    tag: fine-tuning
+  - group: Meta Llama
+    tag: llama
     models:
       - model: Llama 4 Scout 17B-16E
         mad_tag: pyt_train_llama-4-scout-17b-16e
@@ -75,19 +54,19 @@ model_groups:
         model_repo: Llama-3.1-8B
         url: https://huggingface.co/meta-llama/Llama-3.1-8B
         precision: BF16
-        training_modes: [finetune_fw, finetune_lora]
+        training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
       - model: Llama 3.1 70B
         mad_tag: pyt_train_llama-3.1-70b
         model_repo: Llama-3.1-70B
-        url: https://huggingface.co/meta-llama/Llama-3.1-70B
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
         precision: BF16
-        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+        training_modes: [pretrain, finetune_fw, finetune_lora]
       - model: Llama 3.1 405B
         mad_tag: pyt_train_llama-3.1-405b
         model_repo: Llama-3.1-405B
         url: https://huggingface.co/meta-llama/Llama-3.1-405B
         precision: BF16
-        training_modes: [finetune_qlora, HF_finetune_lora]
+        training_modes: [finetune_qlora]
       - model: Llama 3 8B
         mad_tag: pyt_train_llama-3-8b
         model_repo: Llama-3-8B
@@ -117,4 +96,67 @@ model_groups:
         model_repo: Llama-2-70B
         url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
         precision: BF16
-        training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
+        training_modes: [finetune_lora, finetune_qlora]
+  - group: OpenAI
+    tag: openai
+    models:
+      - model: GPT OSS 20B
+        mad_tag: pyt_train_gpt_oss_20b
+        model_repo: GPT-OSS-20B
+        url: https://huggingface.co/openai/gpt-oss-20b
+        precision: BF16
+        training_modes: [HF_finetune_lora]
+      - model: GPT OSS 120B
+        mad_tag: pyt_train_gpt_oss_120b
+        model_repo: GPT-OSS-120B
+        url: https://huggingface.co/openai/gpt-oss-120b
+        precision: BF16
+        training_modes: [HF_finetune_lora]
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 3 8B
+        mad_tag: pyt_train_qwen3-8b
+        model_repo: Qwen3-8B
+        url: https://huggingface.co/Qwen/Qwen3-8B
+        precision: BF16
+        training_modes: [finetune_fw, finetune_lora]
+      - model: Qwen 3 32B
+        mad_tag: pyt_train_qwen3-32b
+        model_repo: Qwen3-32
+        url: https://huggingface.co/Qwen/Qwen3-32B
+        precision: BF16
+        training_modes: [finetune_lora]
+      - model: Qwen 2.5 32B
+        mad_tag: pyt_train_qwen2.5-32b
+        model_repo: Qwen2.5-32B
+        url: https://huggingface.co/Qwen/Qwen2.5-32B
+        precision: BF16
+        training_modes: [finetune_lora]
+      - model: Qwen 2.5 72B
+        mad_tag: pyt_train_qwen2.5-72b
+        model_repo: Qwen2.5-72B
+        url: https://huggingface.co/Qwen/Qwen2.5-72B
+        precision: BF16
+        training_modes: [finetune_lora]
+      - model: Qwen 2 1.5B
+        mad_tag: pyt_train_qwen2-1.5b
+        model_repo: Qwen2-1.5B
+        url: https://huggingface.co/Qwen/Qwen2-1.5B
+        precision: BF16
+        training_modes: [finetune_fw, finetune_lora]
+      - model: Qwen 2 7B
+        mad_tag: pyt_train_qwen2-7b
+        model_repo: Qwen2-7B
+        url: https://huggingface.co/Qwen/Qwen2-7B
+        precision: BF16
+        training_modes: [finetune_fw, finetune_lora]
+  - group: Flux
+    tag: flux
+    models:
+      - model: FLUX.1-dev
+        mad_tag: pyt_train_flux
+        model_repo: Flux
+        url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+        precision: BF16
+        training_modes: [pretrain]
@@ -19,5 +19,6 @@ The general steps to build ROCm are:
 #. Run the build command
+
 Because the ROCm stack is constantly evolving, the most current instructions are stored with the source code in GitHub.
-For detailed build instructions, see `Getting and Building ROCm from Source <https://github.com/ROCm/ROCm?tab=readme-ov-file#getting-and-building-rocm-from-source>`.
+For detailed build instructions, see `Getting and Building ROCm from Source <https://github.com/ROCm/ROCm?tab=readme-ov-file#getting-and-building-rocm-from-source>`_.
@@ -2,58 +2,124 @@
|
|||||||
:description: How to install deep learning frameworks for ROCm
|
:description: How to install deep learning frameworks for ROCm
|
||||||
:keywords: deep learning, frameworks, ROCm, install, PyTorch, TensorFlow, JAX, MAGMA, DeepSpeed, ML, AI
|
:keywords: deep learning, frameworks, ROCm, install, PyTorch, TensorFlow, JAX, MAGMA, DeepSpeed, ML, AI
|
||||||
|
|
||||||
********************************************
|
**********************************
|
||||||
Installing deep learning frameworks for ROCm
|
Deep learning frameworks for ROCm
|
||||||
********************************************
|
**********************************
|
||||||
|
|
||||||
ROCm provides a comprehensive ecosystem for deep learning development, including
|
Deep learning frameworks provide environments for machine learning, training, fine-tuning, inference, and performance optimization.
|
||||||
:ref:`libraries <artificial-intelligence-apis>` for optimized deep learning operations and ROCm-aware versions of popular
|
|
||||||
deep learning frameworks and libraries such as PyTorch, TensorFlow, and JAX. ROCm works closely with these
|
|
||||||
frameworks to ensure that framework-specific optimizations take advantage of AMD accelerator and GPU architectures.
|
|
||||||
|
|
||||||
The following guides provide information on compatibility and supported
|
ROCm offers a complete ecosystem for developing and running deep learning applications efficiently. It also provides ROCm-compatible versions of popular frameworks and libraries, such as PyTorch, TensorFlow, JAX, and others.
|
||||||
features for these ROCm-enabled deep learning frameworks.
|
|
||||||
|
|
||||||
* :doc:`PyTorch compatibility <../compatibility/ml-compatibility/pytorch-compatibility>`
|
The AMD ROCm organization actively contributes to open-source development and collaborates closely with framework organizations. This collaboration ensures that framework-specific optimizations effectively leverage AMD GPUs and accelerators.
|
||||||
* :doc:`TensorFlow compatibility <../compatibility/ml-compatibility/tensorflow-compatibility>`
|
|
||||||
* :doc:`JAX compatibility <../compatibility/ml-compatibility/jax-compatibility>`
|
|
||||||
* :doc:`verl compatibility <../compatibility/ml-compatibility/verl-compatibility>`
|
|
||||||
* :doc:`Stanford Megatron-LM compatibility <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`
|
|
||||||
* :doc:`DGL compatibility <../compatibility/ml-compatibility/dgl-compatibility>`
|
|
||||||
* :doc:`Megablocks compatibility <../compatibility/ml-compatibility/megablocks-compatibility>`
|
|
||||||
* :doc:`Taichi compatibility <../compatibility/ml-compatibility/taichi-compatibility>`
|
|
||||||
|
|
||||||
This chart steps through typical installation workflows for installing deep learning frameworks for ROCm.
|
The table below summarizes information about ROCm-enabled deep learning frameworks. It includes details on ROCm compatibility and third-party tool support, installation steps and options, and links to GitHub resources. For a complete list of supported framework versions on ROCm, see the :doc:`Compatibility matrix <../compatibility/compatibility-matrix>` topic.
|
||||||
|
|
||||||
.. image:: ../data/how-to/framework_install_2024_07_04.png
|
.. list-table::
|
||||||
:alt: Flowchart for installing ROCm-aware machine learning frameworks
|
:header-rows: 1
|
||||||
:align: center
|
:widths: 5 3 6 3
|
||||||
|
|
||||||
See the installation instructions to get started.
|
* - Framework
|
||||||
|
- Installation
|
||||||
|
- Installation options
|
||||||
|
- GitHub
|
||||||
|
|
||||||
* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
* - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`__
|
||||||
* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
- .. raw:: html
|
||||||
* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
|
||||||
* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
|
|
||||||
* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
|
|
||||||
* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/dgl-install>`
|
|
||||||
* :doc:`Megablocks for ROCm <rocm-install-on-linux:install/3rd-party/megablocks-install>`
|
|
||||||
* :doc:`Taichi for ROCm <rocm-install-on-linux:install/3rd-party/taichi-install>`
|
|
||||||
|
|
||||||
.. note::
|
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||||
|
-
|
||||||
|
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`__
|
||||||
|
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`__
|
||||||
|
- `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`__
|
||||||
|
- `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`__
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
For guidance on installing ROCm itself, refer to :doc:`ROCm installation for Linux <rocm-install-on-linux:index>`.
|
<a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
|
* - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`__
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
|
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||||
|
-
|
||||||
|
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`__
|
||||||
|
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`__
|
||||||
|
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
|
<a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
|
* - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`__
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
|
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||||
|
-
|
||||||
|
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`__
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
|
<a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
|
* - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`__
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
|
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||||
|
-
|
||||||
|
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`__
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
|
<a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
|
* - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`__
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
|
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||||
|
-
|
||||||
|
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`__
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
|
<a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
|
* - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`__
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
|
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||||
|
-
|
||||||
|
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
|
<a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
|
* - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`__
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
|
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||||
|
-
|
||||||
|
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`__
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
|
<a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
|
* - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`__
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
|
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||||
|
-
|
||||||
|
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
|
||||||
|
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__
|
||||||
|
|
||||||
|
- .. raw:: html
|
||||||
|
|
||||||
|
<a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
|
||||||
|
|
||||||
Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
|
Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
|
||||||
through the following guides.
|
through the following guides.
|
||||||
|
|
||||||
* :doc:`rocm-for-ai/index`
|
* :doc:`rocm-for-ai/index`
|
||||||
|
|
||||||
* :doc:`Training <rocm-for-ai/training/index>`
|
* :doc:`Use ROCm for training <rocm-for-ai/training/index>`
|
||||||
|
|
||||||
* :doc:`Fine-tuning LLMs <rocm-for-ai/fine-tuning/index>`
|
* :doc:`Use ROCm for fine-tuning LLMs <rocm-for-ai/fine-tuning/index>`
|
||||||
|
|
||||||
* :doc:`Inference <rocm-for-ai/inference/index>`
|
* :doc:`Use ROCm for AI inference <rocm-for-ai/inference/index>`
|
||||||
|
|
||||||
* :doc:`Inference optimization <rocm-for-ai/inference-optimization/index>`
|
* :doc:`Use ROCm for AI inference optimization <rocm-for-ai/inference-optimization/index>`
@@ -939,7 +939,7 @@ hipBLASLt benchmarking
|
|||||||
The GEMM library
|
The GEMM library
|
||||||
`hipBLASLt <https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/index.html>`_
|
`hipBLASLt <https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/index.html>`_
|
||||||
provides a benchmark tool for its supported operations. Refer to the
|
provides a benchmark tool for its supported operations. Refer to the
|
||||||
`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/benchmarks/README.md>`_
|
`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/bench/README.md>`_
|
||||||
for details.
|
for details.
|
||||||
|
|
||||||
* Example 1: Benchmark mix fp8 GEMM
|
* Example 1: Benchmark mix fp8 GEMM
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ vLLM inference performance testing
|
|||||||
This documentation does not reflect the latest version of ROCm vLLM
|
This documentation does not reflect the latest version of ROCm vLLM
|
||||||
inference performance documentation. See :doc:`../vllm` for the latest version.
|
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||||
|
|
||||||
.. _vllm-benchmark-unified-docker:
|
.. _vllm-benchmark-unified-docker-702:
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml
|
||||||
|
|
||||||
@@ -77,7 +77,7 @@ vLLM inference performance testing
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
.. _vllm-benchmark-vllm:
|
.. _vllm-benchmark-vllm-702:
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
@@ -159,7 +159,7 @@ vLLM inference performance testing
|
|||||||
Once the setup is complete, choose between two options to reproduce the
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
benchmark results:
|
benchmark results:
|
||||||
|
|
||||||
.. _vllm-benchmark-mad:
|
.. _vllm-benchmark-mad-702:
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
|
|||||||
@@ -0,0 +1,450 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
|
.. meta::
|
||||||
|
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
|
||||||
|
ROCm vLLM Docker image.
|
||||||
|
:keywords: model, MAD, automation, dashboarding, validate
|
||||||
|
|
||||||
|
**********************************
|
||||||
|
vLLM inference performance testing
|
||||||
|
**********************************
|
||||||
|
|
||||||
|
.. caution::
|
||||||
|
|
||||||
|
This documentation does not reflect the latest version of ROCm vLLM
|
||||||
|
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||||
|
|
||||||
|
.. _vllm-benchmark-unified-docker-715:
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
|
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
|
||||||
|
a prebuilt, optimized environment for validating large language model (LLM)
|
||||||
|
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
|
||||||
|
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
|
||||||
|
accelerators and includes the following components:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Software component
|
||||||
|
- Version
|
||||||
|
|
||||||
|
* - `ROCm <https://github.com/ROCm/ROCm>`__
|
||||||
|
- {{ unified_docker.rocm_version }}
|
||||||
|
|
||||||
|
* - `vLLM <https://docs.vllm.ai/en/latest>`__
|
||||||
|
- {{ unified_docker.vllm_version }}
|
||||||
|
|
||||||
|
* - `PyTorch <https://github.com/ROCm/pytorch>`__
|
||||||
|
- {{ unified_docker.pytorch_version }}
|
||||||
|
|
||||||
|
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
||||||
|
- {{ unified_docker.hipblaslt_version }}
|
||||||
|
|
||||||
|
With this Docker image, you can quickly test the :ref:`expected
|
||||||
|
inference performance numbers <vllm-benchmark-performance-measurements-715>` for
|
||||||
|
MI300X series accelerators.
|
||||||
|
|
||||||
|
What's new
|
||||||
|
==========
|
||||||
|
|
||||||
|
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
|
||||||
|
|
||||||
|
* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default.
|
||||||
|
This parameter has been removed from the benchmarking script.
|
||||||
|
|
||||||
|
* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
|
||||||
|
This parameter has been removed from the benchmarking script.
|
||||||
|
|
||||||
|
* Fixed a ``+rms_norm`` custom kernel issue.
|
||||||
|
|
||||||
|
* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, and ``INT4``. See the example after this list.
|
||||||
|
|
||||||
|
* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
|
||||||
|
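For example, a minimal sketch of enabling quick reduce before running the benchmark script described later in this guide; the ``FP`` mode, the placeholder model repository, and the other benchmark arguments are illustrative assumptions:

.. code-block:: shell

# Enable quick reduce inside the container (choose FP, INT8, INT6, or INT4).
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP
# Then run the benchmark as usual, for example:
./vllm_benchmark_report.sh -s all -m <model_repo> -g 8 -d float16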
|
||||||
|
Supported models
|
||||||
|
================
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
|
.. _vllm-benchmark-available-models-715:
|
||||||
|
|
||||||
|
The following models are supported for inference performance benchmarking
|
||||||
|
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||||
|
documentation might vary by model -- select one to get started.
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model group</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row mt-1">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
.. _vllm-benchmark-vllm-715:
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{model.mad_tag}}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||||
|
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
||||||
|
high-performance custom kernels and modules in vLLM to enhance performance.
|
||||||
|
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||||
|
more information.
|
||||||
|
|
||||||
|
.. _vllm-benchmark-performance-measurements-715:
|
||||||
|
|
||||||
|
Performance measurements
|
||||||
|
========================
|
||||||
|
|
||||||
|
To evaluate performance, the
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
|
page provides reference throughput and latency measurements for inferencing popular AI models.
|
||||||
|
|
||||||
|
.. important::
|
||||||
|
|
||||||
|
The performance data presented in
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
|
only reflects the latest version of this inference benchmarking environment.
|
||||||
|
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
||||||
|
|
||||||
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
|
correctly and performing optimally.
|
||||||
|
|
||||||
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before starting training.
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
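As a quick spot check before running the benchmarks, you can confirm that NUMA auto-balancing is disabled on the host; this is a minimal sketch, and the full procedure is covered in the system validation guide linked above:

.. code-block:: shell

# A value of 0 means NUMA auto-balancing is disabled (the recommended setting).
cat /proc/sys/kernel/numa_balancing

# Disable it for the current boot if needed (requires root).
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'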
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
|
Pull the Docker image
|
||||||
|
=====================
|
||||||
|
|
||||||
|
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||||
|
Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
|
Benchmarking
|
||||||
|
============
|
||||||
|
|
||||||
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
|
benchmark results:
|
||||||
|
|
||||||
|
.. _vllm-benchmark-mad-715:
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{model.mad_tag}}
|
||||||
|
|
||||||
|
.. tab-set::
|
||||||
|
|
||||||
|
.. tab-item:: MAD-integrated benchmarking
|
||||||
|
|
||||||
|
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||||
|
directory and install the required packages on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
|
||||||
|
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||||
|
madengine run \
|
||||||
|
--tags {{model.mad_tag}} \
|
||||||
|
--keep-model-dir \
|
||||||
|
--live-output \
|
||||||
|
--timeout 28800
|
||||||
|
|
||||||
|
MAD launches a Docker container with the name
|
||||||
|
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
||||||
|
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
|
||||||
|
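For example, once the run finishes you can list the generated report files from the host; a trivial sketch using the report path above:

.. code-block:: shell

ls ~/MAD/reports_{{model.precision}}/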
|
||||||
|
Although the :ref:`available models <vllm-benchmark-available-models-715>` are preconfigured
|
||||||
|
to collect latency and throughput performance data, you can also change the benchmarking
|
||||||
|
parameters. See the standalone benchmarking tab for more information.
|
||||||
|
|
||||||
|
{% if model.tunableop %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
|
||||||
|
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||||
|
operators to find the fastest one for your hardware.
|
||||||
|
|
||||||
|
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
|
||||||
|
(see
|
||||||
|
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
|
||||||
|
To enable it, include the ``--tunableop on`` argument in your run, as shown in the sketch after this note.
|
||||||
|
|
||||||
|
Enabling TunableOp triggers a two-pass run -- a warm-up followed
|
||||||
|
by the performance-collection run.
|
||||||
|
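A sketch of the same ``madengine`` command with TunableOp switched on; apart from the added ``--tunableop on`` flag, it mirrors the command shown earlier in this section:

.. code-block:: shell

export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{model.mad_tag}} \
--keep-model-dir \
--live-output \
--timeout 28800 \
--tunableop on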
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
.. tab-item:: Standalone benchmarking
|
||||||
|
|
||||||
|
.. rubric:: Download the Docker image and required scripts
|
||||||
|
|
||||||
|
1. Run the vLLM benchmark tool independently by starting the
|
||||||
|
`Docker container <{{ unified_docker.docker_hub_url }}>`_
|
||||||
|
as shown in the following snippet.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
docker run -it \
|
||||||
|
--device=/dev/kfd \
|
||||||
|
--device=/dev/dri \
|
||||||
|
--group-add video \
|
||||||
|
--shm-size 16G \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--security-opt apparmor=unconfined \
|
||||||
|
--cap-add=SYS_PTRACE \
|
||||||
|
-v $(pwd):/workspace \
|
||||||
|
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||||
|
--name test \
|
||||||
|
{{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
|
2. In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||||
|
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD/scripts/vllm
|
||||||
|
|
||||||
|
3. To start the benchmark, use the following command with the appropriate options.
|
||||||
|
|
||||||
|
.. dropdown:: Benchmark options
|
||||||
|
:open:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
:align: center
|
||||||
|
|
||||||
|
* - Name
|
||||||
|
- Options
|
||||||
|
- Description
|
||||||
|
|
||||||
|
* - ``$test_option``
|
||||||
|
- latency
|
||||||
|
- Measure decoding token latency
|
||||||
|
|
||||||
|
* -
|
||||||
|
- throughput
|
||||||
|
- Measure token generation throughput
|
||||||
|
|
||||||
|
* -
|
||||||
|
- all
|
||||||
|
- Measure both throughput and latency
|
||||||
|
|
||||||
|
* - ``$num_gpu``
|
||||||
|
- 1 or 8
|
||||||
|
- Number of GPUs
|
||||||
|
|
||||||
|
* - ``$datatype``
|
||||||
|
- ``float16`` or ``float8``
|
||||||
|
- Data type
|
||||||
|
|
||||||
|
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||||
|
already configured. You don't need to specify them with this script.
|
||||||
|
|
||||||
|
Command:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh \
|
||||||
|
-s $test_option \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g $num_gpu \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``; see the sketch after this note.
|
||||||
|
|
||||||
|
If you encounter the following error, pass your access-authorized Hugging
|
||||||
|
Face token to the gated models.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
OSError: You are trying to access a gated repo.
|
||||||
|
|
||||||
|
# pass your HF_TOKEN
|
||||||
|
export HF_TOKEN=$your_personal_hf_token
|
||||||
|
|
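A minimal sketch of applying the recommendation from the note above: export the variable in the container shell, then invoke the benchmark script as usual. The ``all`` test option and the eight-GPU count are illustrative choices:

.. code-block:: shell

export VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
./vllm_benchmark_report.sh \
-s all \
-m {{model.model_repo}} \
-g 8 \
-d {{model.precision}}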
||||||
|
.. rubric:: Benchmarking examples
|
||||||
|
|
||||||
|
Here are some examples of running the benchmark with various options:
|
||||||
|
|
||||||
|
* Latency benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh \
|
||||||
|
-s latency \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g 8 \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
|
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
|
||||||
|
|
||||||
|
* Throughput benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh \
|
||||||
|
-s throughput \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g 8 \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
|
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<style>
|
||||||
|
mjx-container[jax="CHTML"][display="true"] {
|
||||||
|
text-align: left;
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Throughput is calculated as:
|
||||||
|
|
||||||
|
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||||
|
|
||||||
|
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
Advanced usage
|
||||||
|
==============
|
||||||
|
|
||||||
|
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||||
|
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
|
||||||
|
|
||||||
|
Reproducing the Docker image
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
||||||
|
|
||||||
|
1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/vllm.git
|
||||||
|
|
||||||
|
2. Check out the specific release commit.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd vllm
|
||||||
|
git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
|
||||||
|
|
||||||
|
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
|
||||||
|
|
||||||
|
Known issues and workarounds
|
||||||
|
============================
|
||||||
|
|
||||||
|
AITER does not support FP8 KV cache yet.
|
||||||
|
|
||||||
|
Further reading
|
||||||
|
===============
|
||||||
|
|
||||||
|
- To learn more about the options for latency and throughput benchmark scripts,
|
||||||
|
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||||
|
|
||||||
|
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||||
|
|
||||||
|
- To learn more about system settings and management practices to configure your system for
|
||||||
|
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||||
|
|
||||||
|
- For application performance optimization strategies for HPC and AI workloads,
|
||||||
|
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||||
|
|
||||||
|
- To learn how to run community models from Hugging Face on AMD GPUs, see
|
||||||
|
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
|
||||||
|
|
||||||
|
- To learn how to fine-tune LLMs and optimize inference, see
|
||||||
|
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
|
||||||
|
|
||||||
|
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||||
|
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||||
|
|
||||||
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`vllm-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/vllm`` Docker image.
|
||||||
@@ -16,14 +16,23 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
|
|||||||
- Components
|
- Components
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812``
|
||||||
(latest)
|
(latest)
|
||||||
|
-
|
||||||
|
* ROCm 6.4.1
|
||||||
|
* vLLM 0.10.0
|
||||||
|
* PyTorch 2.7.0
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <../vllm>`
|
||||||
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa>`__
|
||||||
|
|
||||||
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
|
||||||
-
|
-
|
||||||
* ROCm 6.4.1
|
* ROCm 6.4.1
|
||||||
* vLLM 0.9.1
|
* vLLM 0.9.1
|
||||||
* PyTorch 2.7.0
|
* PyTorch 2.7.0
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../vllm>`
|
* :doc:`Documentation <vllm-0.9.1-20250715>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__
|
||||||
|
|
||||||
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ PyTorch inference performance testing
|
|||||||
|
|
||||||
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
|
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
|
||||||
|
|
||||||
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference pyt_janus_pro_inference
|
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference pyt_janus_pro_inference pyt_hy_video
|
||||||
|
|
||||||
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`__ from Docker Hub.
|
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`__ from Docker Hub.
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
vLLM inference performance testing
|
vLLM inference performance testing
|
||||||
**********************************
|
**********************************
|
||||||
|
|
||||||
.. _vllm-benchmark-unified-docker:
|
.. _vllm-benchmark-unified-docker-812:
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||||
|
|
||||||
@@ -39,7 +39,7 @@ vLLM inference performance testing
|
|||||||
- {{ unified_docker.hipblaslt_version }}
|
- {{ unified_docker.hipblaslt_version }}
|
||||||
|
|
||||||
With this Docker image, you can quickly test the :ref:`expected
|
With this Docker image, you can quickly test the :ref:`expected
|
||||||
inference performance numbers <vllm-benchmark-performance-measurements>` for
|
inference performance numbers <vllm-benchmark-performance-measurements-812>` for
|
||||||
MI300X series accelerators.
|
MI300X series accelerators.
|
||||||
|
|
||||||
What's new
|
What's new
|
||||||
@@ -47,17 +47,11 @@ What's new
|
|||||||
|
|
||||||
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
||||||
|
|
||||||
* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default.
|
* Upgraded to vLLM v0.10.
|
||||||
This parameter has been removed from the benchmarking script.
|
|
||||||
|
|
||||||
* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
|
* FP8 KV cache support via AITER.
|
||||||
This parameter has been removed from the benchmarking script.
|
|
||||||
|
|
||||||
* Fixed a ``+rms_norm`` custom kernel issue.
|
* Full graph capture support via AITER.
|
||||||
|
|
||||||
* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``.
|
|
||||||
|
|
||||||
* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
|
|
||||||
|
|
||||||
Supported models
|
Supported models
|
||||||
================
|
================
|
||||||
@@ -67,7 +61,7 @@ Supported models
|
|||||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
.. _vllm-benchmark-available-models:
|
.. _vllm-benchmark-available-models-812:
|
||||||
|
|
||||||
The following models are supported for inference performance benchmarking
|
The following models are supported for inference performance benchmarking
|
||||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||||
@@ -102,7 +96,7 @@ Supported models
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
.. _vllm-benchmark-vllm:
|
.. _vllm-benchmark-vllm-812:
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
@@ -124,14 +118,14 @@ Supported models
|
|||||||
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||||
more information.
|
more information.
|
||||||
|
|
||||||
.. _vllm-benchmark-performance-measurements:
|
.. _vllm-benchmark-performance-measurements-812:
|
||||||
|
|
||||||
Performance measurements
|
Performance measurements
|
||||||
========================
|
========================
|
||||||
|
|
||||||
To evaluate performance, the
|
To evaluate performance, the
|
||||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
page provides reference throughput and latency measurements for inferencing popular AI models.
|
page provides reference throughput and serving measurements for inferencing popular AI models.
|
||||||
|
|
||||||
.. important::
|
.. important::
|
||||||
|
|
||||||
@@ -176,7 +170,7 @@ system's configuration.
|
|||||||
Once the setup is complete, choose between two options to reproduce the
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
benchmark results:
|
benchmark results:
|
||||||
|
|
||||||
.. _vllm-benchmark-mad:
|
.. _vllm-benchmark-mad-812:
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
@@ -209,12 +203,15 @@ system's configuration.
|
|||||||
--timeout 28800
|
--timeout 28800
|
||||||
|
|
||||||
MAD launches a Docker container with the name
|
MAD launches a Docker container with the name
|
||||||
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||||
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
|
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||||
|
and ``{{ model.mad_tag }}_serving.csv``.
|
||||||
|
|
||||||
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
|
Although the :ref:`available models
|
||||||
to collect latency and throughput performance data, you can also change the benchmarking
|
<vllm-benchmark-available-models-812>` are preconfigured to collect
|
||||||
parameters. See the standalone benchmarking tab for more information.
|
offline throughput and online serving performance data, you can
|
||||||
|
also change the benchmarking parameters. See the standalone
|
||||||
|
benchmarking tab for more information.
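For example, a quick way to inspect the generated reports once the run completes; ``column`` is used here only to pretty-print the CSV files named above:

.. code-block:: shell

column -s, -t < {{ model.mad_tag }}_throughput.csv
column -s, -t < {{ model.mad_tag }}_serving.csv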
|
||||||
|
|
||||||
{% if model.tunableop %}
|
{% if model.tunableop %}
|
||||||
|
|
||||||
@@ -224,14 +221,12 @@ system's configuration.
|
|||||||
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||||
operators to find the fastest one for your hardware.
|
operators to find the fastest one for your hardware.
|
||||||
|
|
||||||
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
|
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
|
||||||
(see
|
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
|
||||||
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
|
the ``--tunableop on`` argument in your run.
|
||||||
To enable it, include the ``--tunableop on`` argument in your
|
|
||||||
run.
|
|
||||||
|
|
||||||
Enabling TunableOp triggers a two-pass run -- a warm-up followed
|
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
|
||||||
by the performance-collection run.
|
performance-collection run.
|
||||||
|
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
@@ -269,6 +264,13 @@ system's configuration.
|
|||||||
|
|
||||||
3. To start the benchmark, use the following command with the appropriate options.
|
3. To start the benchmark, use the following command with the appropriate options.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./run.sh \
|
||||||
|
--config $CONFIG_CSV \
|
||||||
|
--model_repo {{ model.model_repo }} \
|
||||||
|
<overrides>
|
||||||
|
|
||||||
.. dropdown:: Benchmark options
|
.. dropdown:: Benchmark options
|
||||||
:open:
|
:open:
|
||||||
|
|
||||||
@@ -280,42 +282,40 @@ system's configuration.
|
|||||||
- Options
|
- Options
|
||||||
- Description
|
- Description
|
||||||
|
|
||||||
* - ``$test_option``
|
* - ``--config``
|
||||||
- latency
|
- ``configs/default.csv``
|
||||||
- Measure decoding token latency
|
- Run configs from the CSV for the chosen model repo and benchmark.
|
||||||
|
|
||||||
* -
|
* -
|
||||||
- throughput
|
- ``configs/extended.csv``
|
||||||
- Measure token generation throughput
|
-
|
||||||
|
|
||||||
* -
|
* -
|
||||||
- all
|
- ``configs/performance.csv``
|
||||||
- Measure both throughput and latency
|
-
|
||||||
|
|
||||||
* - ``$num_gpu``
|
* - ``--benchmark``
|
||||||
- 1 or 8
|
- ``throughput``
|
||||||
- Number of GPUs
|
- Measure offline end-to-end throughput.
|
||||||
|
|
||||||
* - ``$datatype``
|
* -
|
||||||
- ``float16`` or ``float8``
|
- ``serving``
|
||||||
- Data type
|
- Measure online serving performance.
|
||||||
|
|
||||||
|
* -
|
||||||
|
- ``all``
|
||||||
|
- Measure both throughput and serving.
|
||||||
|
|
||||||
|
* - `<overrides>`
|
||||||
|
- See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
|
||||||
|
- Additional overrides to the config CSV.
|
||||||
|
|
||||||
The input sequence length, output sequence length, and tensor parallel (TP) are
|
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||||
already configured. You don't need to specify them with this script.
|
already configured. You don't need to specify them with this script.
|
||||||
|
|
||||||
Command:
|
|
||||||
|
|
||||||
.. code-block::
|
|
||||||
|
|
||||||
./vllm_benchmark_report.sh \
|
|
||||||
-s $test_option \
|
|
||||||
-m {{model.model_repo}} \
|
|
||||||
-g $num_gpu \
|
|
||||||
-d {{model.precision}}
|
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
For best performance, it's recommend to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
|
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
|
||||||
|
|
||||||
If you encounter the following error, pass your access-authorized Hugging
|
If you encounter the following error, pass your access-authorized Hugging
|
||||||
Face token to the gated models.
|
Face token to the gated models.
|
||||||
@@ -331,33 +331,33 @@ system's configuration.
|
|||||||
|
|
||||||
Here are some examples of running the benchmark with various options:
|
Here are some examples of running the benchmark with various options:
|
||||||
|
|
||||||
* Latency benchmark
|
|
||||||
|
|
||||||
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
|
||||||
|
|
||||||
.. code-block::
|
|
||||||
|
|
||||||
./vllm_benchmark_report.sh \
|
|
||||||
-s latency \
|
|
||||||
-m {{model.model_repo}} \
|
|
||||||
-g 8 \
|
|
||||||
-d {{model.precision}}
|
|
||||||
|
|
||||||
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
|
|
||||||
|
|
||||||
* Throughput benchmark
|
* Throughput benchmark
|
||||||
|
|
||||||
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
./vllm_benchmark_report.sh \
|
export MAD_MODEL_NAME={{ model.mad_tag }}
|
||||||
-s throughput \
|
./run.sh \
|
||||||
-m {{model.model_repo}} \
|
--config configs/default.csv \
|
||||||
-g 8 \
|
--model_repo {{model.model_repo}} \
|
||||||
-d {{model.precision}}
|
--benchmark throughput
|
||||||
|
|
||||||
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
|
Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
|
||||||
|
|
||||||
|
* Serving benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
export MAD_MODEL_NAME={{ model.mad_tag }}
|
||||||
|
./run.sh \
|
||||||
|
--config configs/default.csv \
|
||||||
|
--model_repo {{model.model_repo}} \
|
||||||
|
--benchmark serving
|
||||||
|
|
||||||
|
Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
|
||||||
|
|
||||||
.. raw:: html
|
.. raw:: html
|
||||||
|
|
||||||
@@ -400,7 +400,7 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
|||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
cd vllm
|
cd vllm
|
||||||
git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
|
git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
|
||||||
|
|
||||||
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
||||||
|
|
||||||
@@ -408,11 +408,6 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
|||||||
|
|
||||||
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
|
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
|
||||||
|
|
||||||
Known issues and workarounds
|
|
||||||
============================
|
|
||||||
|
|
||||||
AITER does not support FP8 KV cache yet.
|
|
||||||
|
|
||||||
Further reading
|
Further reading
|
||||||
===============
|
===============
|
||||||
|
|
||||||
|
|||||||
@@ -1,14 +1,14 @@
|
|||||||
.. meta::
|
.. meta::
|
||||||
:description: How to install ROCm and popular machine learning frameworks.
|
:description: How to install ROCm and popular deep learning frameworks.
|
||||||
:keywords: ROCm, AI, LLM, train, fine-tune, FSDP, DeepSpeed, LLaMA, tutorial
|
:keywords: ROCm, AI, LLM, train, fine-tune, FSDP, DeepSpeed, LLaMA, tutorial
|
||||||
|
|
||||||
.. _rocm-for-ai-install:
|
.. _rocm-for-ai-install:
|
||||||
|
|
||||||
***********************************************
|
********************************************
|
||||||
Installing ROCm and machine learning frameworks
|
Installing ROCm and deep learning frameworks
|
||||||
***********************************************
|
********************************************
|
||||||
|
|
||||||
Before getting started, install ROCm and supported machine learning frameworks.
|
Before getting started, install ROCm and supported deep learning frameworks.
|
||||||
|
|
||||||
.. grid:: 1
|
.. grid:: 1
|
||||||
|
|
||||||
@@ -22,9 +22,9 @@ If you’re new to ROCm, refer to the :doc:`ROCm quick start install guide for L
|
|||||||
<rocm-install-on-linux:install/quick-start>`.
|
<rocm-install-on-linux:install/quick-start>`.
|
||||||
|
|
||||||
If you’re using a Radeon GPU for graphics-accelerated applications, refer to the
|
If you’re using a Radeon GPU for graphics-accelerated applications, refer to the
|
||||||
`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/docs-6.1.3/docs/install/native_linux/install-radeon.html>`_.
|
`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/howto_native_linux.html>`_.
|
||||||
|
|
||||||
You can install ROCm on :ref:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
|
You can install ROCm on :doc:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
|
||||||
distribution's package manager. See the following documentation resources to get started:
|
distribution's package manager. See the following documentation resources to get started:
|
||||||
|
|
||||||
* :doc:`ROCm installation overview <rocm-install-on-linux:install/install-overview>`
|
* :doc:`ROCm installation overview <rocm-install-on-linux:install/install-overview>`
|
||||||
@@ -43,29 +43,16 @@ distribution's package manager. See the following documentation resources to get
|
|||||||
If you encounter any issues during installation, refer to the
|
If you encounter any issues during installation, refer to the
|
||||||
:doc:`Installation troubleshooting <rocm-install-on-linux:reference/install-faq>` guide.
|
:doc:`Installation troubleshooting <rocm-install-on-linux:reference/install-faq>` guide.
|
||||||
|
|
||||||
Machine learning frameworks
|
Deep learning frameworks
|
||||||
===========================
|
========================
|
||||||
|
|
||||||
ROCm supports popular machine learning frameworks and libraries including `PyTorch
|
ROCm supports deep learning frameworks and libraries including `PyTorch
|
||||||
<https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package>`_, `TensorFlow
|
<https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package>`_, `TensorFlow
|
||||||
<https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and `DeepSpeed
|
<https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and more.
|
||||||
<https://cloudblogs.microsoft.com/opensource/2022/03/21/supporting-efficient-large-model-training-on-amd-instinct-gpus-with-deepspeed/>`_.
|
|
||||||
|
|
||||||
Review the framework installation documentation. For ease-of-use, it's recommended to use official ROCm prebuilt Docker
|
Review the :doc:`framework installation documentation <../deep-learning-rocm>`. For ease-of-use, it's recommended to use official ROCm prebuilt Docker
|
||||||
images with the framework pre-installed.
|
images with the framework pre-installed.
|
||||||
|
|
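For example, pulling the prebuilt ROCm PyTorch image is a single command; a minimal sketch in which the ``latest`` tag is an assumption, so pick the tag that matches your ROCm and Python versions on Docker Hub:

.. code-block:: shell

docker pull rocm/pytorch:latest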
||||||
* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
|
||||||
|
|
||||||
* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
|
||||||
|
|
||||||
* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
|
||||||
|
|
||||||
* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
|
|
||||||
|
|
||||||
* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
|
||||||
|
|
||||||
* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
|
||||||
|
|
||||||
Next steps
|
Next steps
|
||||||
==========
|
==========
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
.. meta::
|
.. meta::
|
||||||
:description: How to train a model using Megatron-LM for ROCm.
|
:description: How to train a model using Megatron-LM for ROCm.
|
||||||
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||||
@@ -6,6 +8,14 @@
|
|||||||
Training a model with Megatron-LM for ROCm
|
Training a model with Megatron-LM for ROCm
|
||||||
******************************************
|
******************************************
|
||||||
|
|
||||||
|
.. caution::
|
||||||
|
|
||||||
|
The ROCm Megatron-LM framework now has limited support in this Docker
environment, which focuses on Primus with Megatron-Core. See :doc:`primus-megatron`.
|
||||||
|
|
||||||
|
To learn how to migrate your existing workloads to Primus with Megatron-Core,
|
||||||
|
see :doc:`previous-versions/megatron-lm-primus-migration-guide`.
|
||||||
|
|
||||||
The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
|
The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
|
||||||
a specialized fork of the robust Megatron-LM, designed to enable efficient
|
a specialized fork of the robust Megatron-LM, designed to enable efficient
|
||||||
training of large-scale language models on AMD GPUs. By leveraging AMD
|
training of large-scale language models on AMD GPUs. By leveraging AMD
|
||||||
@@ -20,13 +30,17 @@ essential components, including PyTorch, ROCm libraries, and Megatron-LM
|
|||||||
utilities. It contains the following software components to accelerate training
|
utilities. It contains the following software components to accelerate training
|
||||||
workloads:
|
workloads:
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
|
||||||
|
Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
||||||
|
|
||||||
{% set dockers = data.dockers %}
|
{% set dockers = data.dockers %}
|
||||||
{% if dockers|length > 1 %}
|
|
||||||
.. tab-set::
|
.. tab-set::
|
||||||
|
|
||||||
{% for docker in data.dockers %}
|
{% for docker in dockers %}
|
||||||
.. tab-item:: ``{{ docker.pull_tag }}``
|
.. tab-item:: ``{{ docker.pull_tag }}``
|
||||||
:sync: {{ docker.pull_tag }}
|
:sync: {{ docker.pull_tag }}
|
||||||
|
|
||||||
@@ -42,28 +56,14 @@ workloads:
|
|||||||
|
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% elif dockers|length == 1 %}
|
|
||||||
.. list-table::
|
|
||||||
:header-rows: 1
|
|
||||||
|
|
||||||
* - Software component
|
|
||||||
- Version
|
|
||||||
|
|
||||||
{% for component_name, component_version in docker.components %}
|
|
||||||
* - {{ component_name }}
|
|
||||||
- {{ component_version }}
|
|
||||||
|
|
||||||
{% endfor %}
|
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
.. _amd-megatron-lm-model-support:
|
.. _amd-megatron-lm-model-support:
|
||||||
|
|
||||||
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
|
||||||
|
|
||||||
Supported models
|
Supported models
|
||||||
================
|
================
|
||||||
|
|
||||||
The following models are supported for training performance benchmarking with Megatron-LM and ROCm.
|
The following models are supported for training performance benchmarking with Megatron-LM and ROCm
|
||||||
|
on AMD Instinct MI300X series accelerators.
|
||||||
Some instructions, commands, and training recommendations in this documentation might
|
Some instructions, commands, and training recommendations in this documentation might
|
||||||
vary by model -- select one to get started.
|
vary by model -- select one to get started.
|
||||||
|
|
||||||
@@ -177,7 +177,7 @@ Download the Docker image
|
|||||||
{% if dockers|length > 1 %}
|
{% if dockers|length > 1 %}
|
||||||
.. tab-set::
|
.. tab-set::
|
||||||
|
|
||||||
{% for docker in data.dockers %}
|
{% for docker in dockers %}
|
||||||
.. tab-item:: {{ docker.doc_name }}
|
.. tab-item:: {{ docker.doc_name }}
|
||||||
:sync: {{ docker.pull_tag }}
|
:sync: {{ docker.pull_tag }}
|
||||||
|
|
||||||
@@ -227,10 +227,17 @@ Download the Docker image
|
|||||||
docker start megatron_training_env
|
docker start megatron_training_env
|
||||||
docker exec -it megatron_training_env bash
|
docker exec -it megatron_training_env bash
|
||||||
|
|
||||||
The Docker container includes a pre-installed, verified version of the ROCm
|
4. **Megatron-LM backward compatibility setup** -- this Docker image is primarily intended for use with Primus, but it maintains Megatron-LM compatibility with limited support.
|
||||||
Megatron-LM development branch
|
To roll back to using Megatron-LM, follow these steps:
|
||||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__, including necessary
|
|
||||||
training scripts.
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd /workspace/Megatron-LM/
|
||||||
|
pip uninstall megatron-core
|
||||||
|
pip install -e .
|
||||||
|
|
||||||
|
The Docker container hosts
|
||||||
|
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__ at verified commit ``e8e9edc``.
|
||||||
|
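To confirm the rollback picks up the in-container checkout rather than the previously installed package, a quick sanity check; the expected path is an assumption based on the ``/workspace/Megatron-LM`` location used above:

.. code-block:: shell

# Should resolve to the editable install under /workspace/Megatron-LM.
python -c "import megatron; print(megatron.__file__)"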
|
||||||
.. _amd-megatron-lm-environment-setup:
|
.. _amd-megatron-lm-environment-setup:
|
||||||
|
|
||||||
|
|||||||
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
|
|||||||
- Components
|
- Components
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
* - v25.6 (latest)
|
* - v25.7 (latest)
|
||||||
|
-
|
||||||
|
* ROCm
|
||||||
|
* PyTorch
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <../megatron-lm>`
|
||||||
|
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a>`__
|
||||||
|
|
||||||
|
* - v25.6
|
||||||
-
|
-
|
||||||
* ROCm 6.4.1
|
* ROCm 6.4.1
|
||||||
* PyTorch 2.8.0a0+git7d205b2
|
* PyTorch 2.8.0a0+git7d205b2
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../megatron-lm>`
|
* :doc:`Documentation <megatron-lm-v25.6>`
|
||||||
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
|
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
|
||||||
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__
|
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,175 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
|
**********************************************************************
|
||||||
|
Migrating workloads to Primus (Megatron-Core backend) from Megatron-LM
|
||||||
|
**********************************************************************
|
||||||
|
|
||||||
|
Primus supports Megatron-Core as a backend optimization library,
replacing ROCm Megatron-LM. This document outlines the steps to migrate
workloads from ROCm Megatron-LM to Primus with the Megatron-Core backend.
|
||||||
|
|
||||||
|
Model architecture
|
||||||
|
==================
|
||||||
|
|
||||||
|
ROCm Megatron-LM defines model architecture parameters in the training scripts;
|
||||||
|
for example, the Llama 3 8B model parameters are defined in
|
||||||
|
`examples/llama/train_llama3.sh <https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama3.sh#L117>`__
|
||||||
|
as shown below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
HIDDEN_SIZE=4096
|
||||||
|
FFN_HIDDEN_SIZE=14336
|
||||||
|
NUM_LAYERS=32
|
||||||
|
NUM_HEADS=32
|
||||||
|
NUM_KV_HEADS=8
|
||||||
|
|
||||||
|
Primus defines the model architecture through model YAML configuration files
|
||||||
|
inside the Primus repository's ``primus/configs/models/megatron/`` directory. For example, Llama 3 8B
|
||||||
|
model architecture parameters are defined in
|
||||||
|
`primus/configs/models/megatron/llama3_8B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/llama3_8B.yaml>`__
|
||||||
|
as shown below:
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
bases:
|
||||||
|
- llama3_base.yaml
|
||||||
|
|
||||||
|
tokenizer_type: Llama3Tokenizer
|
||||||
|
tokenizer_model: meta-llama/Llama-3.1-8B
|
||||||
|
|
||||||
|
ffn_hidden_size: 14336
|
||||||
|
hidden_size: 4096
|
||||||
|
num_attention_heads: 32
|
||||||
|
num_layers: 32
|
||||||
|
num_query_groups: 8
|
||||||
|
|
||||||
|
Primus' model config files follow a hierarchical design, meaning that new model
|
||||||
|
config YAMLs can inherit existing model config files by importing them as
|
||||||
|
bases. For example,
|
||||||
|
`llama3.1_8B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
|
||||||
|
uses ``llama3_8B.yaml`` as a base config and overrides a few parameters, as shown below.
|
||||||
|
In this example, ``llama3.1_8B`` overrides the ``max_position_embeddings`` value:
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
bases:
|
||||||
|
- llama3_8B.yaml
|
||||||
|
|
||||||
|
tokenizer_type: Llama3Tokenizer
|
||||||
|
tokenizer_model: meta-llama/Llama-3.1-8B
|
||||||
|
|
||||||
|
max_position_embeddings: 131072
|
||||||
|
|
||||||
|
.. tip::
|
||||||
|
|
||||||
|
Primus provides ``llama_base.yaml`` as the base configuration, which can be
|
||||||
|
used as a base for additional model architectures. For example,
|
||||||
|
`mixtral_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/mixtral_base.yaml>`__
|
||||||
|
and
|
||||||
|
`deepseek_v3_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/deepseek_v3_base.yaml>`__
|
||||||
|
define ``llama_base.yaml`` as their base.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
# Example mixtral_base.yaml:
|
||||||
|
|
||||||
|
bases:
|
||||||
|
- llama_base.yaml
|
||||||
|
|
||||||
|
init_method_std: 0.01
|
||||||
|
rotary_base: 1000000
|
||||||
|
qk_layernorm: false
|
||||||
|
|
||||||
|
group_query_attention: true
|
||||||
|
num_query_groups: 8
|
||||||
|
|
||||||
|
# moe parameters
|
||||||
|
num_experts: 8
|
||||||
|
moe_router_topk: 2
|
||||||
|
moe_router_load_balancing_type: aux_loss
|
||||||
|
moe_aux_loss_coeff: 1e-2
|
||||||
|
moe_grouped_gemm: true
|
||||||
|
moe_token_dispatcher_type: alltoall
|
||||||
|
|
||||||
|
It is recommended to add a new ``${MODEL_NAME}_base.yaml`` when introducing a new
category of model, and to define new models on top of it. For example, to add
|
||||||
|
Qwen2.5 models in Primus, we define
|
||||||
|
`qwen2.5_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_base.yaml>`__
|
||||||
|
and build
|
||||||
|
`qwen2.5_7B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_7B.yaml>`__
|
||||||
|
and
|
||||||
|
`qwen2.5_72B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_72B.yaml>`__
|
||||||
|
using ``qwen2.5_base.yaml`` as the base config.
|
||||||
|
|
||||||
|
Training parameters
|
||||||
|
===================
|
||||||
|
|
||||||
|
ROCm Megatron-LM also defines the training parameters, like batch size,
|
||||||
|
tensor parallelism, precision, and so on, in the training scripts. For example,
|
||||||
|
Llama3 8B model parameters are defined in
|
||||||
|
`examples/llama/train_llama3.sh <https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama3.sh>`__
|
||||||
|
as shown below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
TP="${TP:-8}"
|
||||||
|
PP="${PP:-1}"
|
||||||
|
CP="${CP:-1}"
|
||||||
|
MBS="${MBS:-1}"
|
||||||
|
BS="${BS:-8}"
|
||||||
|
|
||||||
|
Primus defines the training parameters in top-level YAML files -- see
|
||||||
|
`examples/megatron/configs/
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
|
||||||
|
For example, the `llama3.1_8B-pretrain.yaml
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
|
||||||
|
configuration imports the ``llama3.1_8B.yaml`` model architecture file. Users can then override
|
||||||
|
the default training parameters in ``llama3.1_8B-pretrain.yaml``.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
# model to run
|
||||||
|
model: llama3.1_8B.yaml # Model architecture yaml
|
||||||
|
overrides:
|
||||||
|
# log
|
||||||
|
# disable_wandb: false
|
||||||
|
# disable_tensorboard: false
|
||||||
|
stderr_sink_level: DEBUG
|
||||||
|
|
||||||
|
log_avg_skip_iterations: 2
|
||||||
|
log_avg_reset_interval: 50
|
||||||
|
|
||||||
|
train_iters: 50
|
||||||
|
micro_batch_size: 2
|
||||||
|
global_batch_size: 128
|
||||||
|
|
||||||
|
seq_length: 8192
|
||||||
|
max_position_embeddings: 8192
|
||||||
|
|
||||||
|
lr: 1.0e-5
|
||||||
|
min_lr: 0.0
|
||||||
|
lr_warmup_iters: 2
|
||||||
|
lr_decay_iters: null
|
||||||
|
lr_decay_style: cosine
|
||||||
|
weight_decay: 0.1
|
||||||
|
adam_beta1: 0.9
|
||||||
|
adam_beta2: 0.95
|
||||||
|
eod_mask_loss: true
|
||||||
|
init_method_std: 0.008
|
||||||
|
norm_epsilon: 1.0e-6
|
||||||
|
|
||||||
|
Backward compatibility with Megatron-LM
|
||||||
|
=======================================
|
||||||
|
|
||||||
|
The Dockerized environment used for Primus maintains limited compatibility with
|
||||||
|
Megatron-LM. To roll back to using Megatron-LM, follow these steps.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd /workspace/Megatron-LM/
|
||||||
|
pip uninstall megatron-core
|
||||||
|
pip install -e .
|
||||||
|
|
||||||
|
Once Megatron-LM is installed, follow :doc:`the documentation <../megatron-lm>` to run workloads as
|
||||||
|
usual.
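
As an optional sanity check, you can confirm that the editable install now
resolves to the local checkout. This assumes the upstream Megatron-LM package
layout, where ``megatron/core`` lives at the repository root:

.. code-block:: shell

   # Should print a path under /workspace/Megatron-LM/ if the editable
   # install is active.
   python -c "import megatron.core as mc; print(mc.__file__)"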
|
||||||
@@ -18,7 +18,7 @@ Training a model with ROCm Megatron-LM
|
|||||||
The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
|
The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
|
||||||
enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
|
enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
|
||||||
accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
|
accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
|
||||||
workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
|
workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support-24-12>`
|
||||||
like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
|
like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
|
||||||
efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
|
efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
|
||||||
|
|
||||||
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e
|
|||||||
|
|
||||||
- Pre-training
|
- Pre-training
|
||||||
|
|
||||||
.. _amd-megatron-lm-model-support:
|
.. _amd-megatron-lm-model-support-24-12:
|
||||||
|
|
||||||
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
||||||
|
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e
|
|||||||
|
|
||||||
- Pre-training
|
- Pre-training
|
||||||
|
|
||||||
.. _amd-megatron-lm-model-support:
|
.. _amd-megatron-lm-model-support-25-3:
|
||||||
|
|
||||||
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
|
||||||
|
|
||||||
@@ -278,7 +278,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
|
|||||||
.. tab-item:: Llama
|
.. tab-item:: Llama
|
||||||
:sync: llama
|
:sync: llama
|
||||||
|
|
||||||
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
|
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``Llama2Tokenizer``.
|
||||||
|
|
||||||
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
|
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
|
||||||
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
|
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
|
||||||
@@ -292,7 +292,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
|
|||||||
.. tab-item:: DeepSeek V2
|
.. tab-item:: DeepSeek V2
|
||||||
:sync: deepseek
|
:sync: deepseek
|
||||||
|
|
||||||
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
|
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``DeepSeekV2Tokenizer``.
|
||||||
|
|
||||||
Multi-node training
|
Multi-node training
|
||||||
^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e
|
|||||||
|
|
||||||
- Pre-training
|
- Pre-training
|
||||||
|
|
||||||
.. _amd-megatron-lm-model-support:
|
.. _amd-megatron-lm-model-support-25-4:
|
||||||
|
|
||||||
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
||||||
|
|
||||||
@@ -291,7 +291,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
|
|||||||
.. tab-item:: Llama
|
.. tab-item:: Llama
|
||||||
:sync: llama
|
:sync: llama
|
||||||
|
|
||||||
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
|
To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``Llama2Tokenizer``
|
||||||
or the default ``HuggingFaceTokenizer``.
|
or the default ``HuggingFaceTokenizer``.
|
||||||
|
|
||||||
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
|
To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
|
||||||
@@ -320,7 +320,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
|
|||||||
.. tab-item:: DeepSeek V2
|
.. tab-item:: DeepSeek V2
|
||||||
:sync: deepseek
|
:sync: deepseek
|
||||||
|
|
||||||
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
|
To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``DeepSeekV2Tokenizer``.
|
||||||
|
|
||||||
Multi-node training
|
Multi-node training
|
||||||
^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^
|
||||||
|
|||||||
File diff suppressed because it is too large
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
|
|||||||
- Components
|
- Components
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
|
* - v25.7
|
||||||
|
-
|
||||||
|
* ROCm 6.4.2
|
||||||
|
* PyTorch 2.8.0a0+gitd06a406
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <../pytorch-training>`
|
||||||
|
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712>`__
|
||||||
|
|
||||||
* - v25.6
|
* - v25.6
|
||||||
-
|
-
|
||||||
* ROCm 6.3.4
|
* ROCm 6.3.4
|
||||||
* PyTorch 2.8.0a0+git7d205b2
|
* PyTorch 2.8.0a0+git7d205b2
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../pytorch-training>`
|
* :doc:`Documentation <pytorch-training-v25.6>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__
|
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__
|
||||||
|
|
||||||
* - v25.5
|
* - v25.5
|
||||||
|
|||||||
@@ -437,3 +437,8 @@ Once the setup is complete, choose between two options to start benchmarking:
|
|||||||
|
|
||||||
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
|
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
|
||||||
|
|
||||||
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/pytorch-training`` Docker image.
|
||||||
|
|||||||
@@ -0,0 +1,456 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
|
.. meta::
|
||||||
|
:description: How to train a model using PyTorch for ROCm.
|
||||||
|
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||||
|
|
||||||
|
**************************************
|
||||||
|
Training a model with PyTorch for ROCm
|
||||||
|
**************************************
|
||||||
|
|
||||||
|
.. caution::
|
||||||
|
|
||||||
|
This documentation does not reflect the latest version of the ROCm PyTorch
|
||||||
|
training documentation. See :doc:`../pytorch-training` for the latest version.
|
||||||
|
|
||||||
|
PyTorch is an open-source machine learning framework that is widely used for
|
||||||
|
model training with GPU-optimized components for transformer-based models.
|
||||||
|
|
||||||
|
The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_
|
||||||
|
(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
|
||||||
|
model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
|
||||||
|
training workloads:
|
||||||
|
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Software component | Version |
|
||||||
|
+==========================+================================+
|
||||||
|
| ROCm | 6.3.4 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| PyTorch | 2.8.0a0+git7d205b2 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Python | 3.10.17 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Transformer Engine | 1.14.0+2f85f5f2 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Flash Attention | 3.0.0.post1 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| hipBLASLt | 0.15.0-8c6919d |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
| Triton | 3.3.0 |
|
||||||
|
+--------------------------+--------------------------------+
|
||||||
|
|
||||||
|
.. _amd-pytorch-training-model-support-v256:
|
||||||
|
|
||||||
|
Supported models
|
||||||
|
================
|
||||||
|
|
||||||
|
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.model_groups %}
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-2 me-2 model-param-head">Workload</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
<div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row mt-1">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Some models require an external license agreement through a third party (for example, Meta).
|
||||||
|
|
||||||
|
.. _amd-pytorch-training-performance-measurements-v256:
|
||||||
|
|
||||||
|
Performance measurements
|
||||||
|
========================
|
||||||
|
|
||||||
|
To evaluate performance, the
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||||
|
page provides reference throughput and latency measurements for training
|
||||||
|
popular AI models.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
The performance data presented in
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||||
|
should not be interpreted as the peak performance achievable by AMD
|
||||||
|
Instinct MI325X and MI300X accelerators or ROCm software.
|
||||||
|
|
||||||
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
|
correctly and performing optimally.
|
||||||
|
|
||||||
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before starting training.
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
|
This Docker image is optimized for specific model configurations outlined
|
||||||
|
below. Performance can vary for other training workloads, as AMD
|
||||||
|
doesn’t validate configurations and run conditions outside those described.
|
||||||
|
|
||||||
|
Benchmarking
|
||||||
|
============
|
||||||
|
|
||||||
|
Once the setup is complete, choose between two options to start benchmarking:
|
||||||
|
|
||||||
|
.. tab-set::
|
||||||
|
|
||||||
|
.. tab-item:: MAD-integrated benchmarking
|
||||||
|
|
||||||
|
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||||
|
directory and install the required packages on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
|
For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||||
|
using one GPU with the {{ model.precision }} data type on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||||
|
madengine run \
|
||||||
|
--tags {{ model.mad_tag }} \
|
||||||
|
--keep-model-dir \
|
||||||
|
--live-output \
|
||||||
|
--timeout 28800
|
||||||
|
|
||||||
|
MAD launches a Docker container with the name
|
||||||
|
``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
|
||||||
|
model are collected in the following path: ``~/MAD/perf.csv``.
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. tab-item:: Standalone benchmarking
|
||||||
|
|
||||||
|
.. rubric:: Download the Docker image and required packages
|
||||||
|
|
||||||
|
Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
|
Run the Docker container.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
|
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker start training_env
|
||||||
|
docker exec -it training_env bash
|
||||||
|
|
||||||
|
In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
|
||||||
|
repository and navigate to the benchmark scripts directory
|
||||||
|
``/workspace/MAD/scripts/pytorch_train``.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD/scripts/pytorch_train
|
||||||
|
|
||||||
|
.. rubric:: Prepare training datasets and dependencies
|
||||||
|
|
||||||
|
The following benchmarking examples require downloading models and datasets
|
||||||
|
from Hugging Face. To ensure successful access to gated repos, set your
|
||||||
|
``HF_TOKEN``.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||||
|
|
||||||
|
Run the setup script to install libraries and datasets needed for benchmarking.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./pytorch_benchmark_setup.sh
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_train_llama-3.1-8b
|
||||||
|
|
||||||
|
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Library
|
||||||
|
- Reference
|
||||||
|
|
||||||
|
* - ``accelerate``
|
||||||
|
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||||
|
|
||||||
|
* - ``datasets``
|
||||||
|
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_train_llama-3.1-70b
|
||||||
|
|
||||||
|
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Library
|
||||||
|
- Reference
|
||||||
|
|
||||||
|
* - ``datasets``
|
||||||
|
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||||
|
|
||||||
|
* - ``torchdata``
|
||||||
|
- `TorchData <https://pytorch.org/data/beta/index.html>`_
|
||||||
|
|
||||||
|
* - ``tomli``
|
||||||
|
- `Tomli <https://pypi.org/project/tomli/>`_
|
||||||
|
|
||||||
|
* - ``tiktoken``
|
||||||
|
- `tiktoken <https://github.com/openai/tiktoken>`_
|
||||||
|
|
||||||
|
* - ``blobfile``
|
||||||
|
- `blobfile <https://pypi.org/project/blobfile/>`_
|
||||||
|
|
||||||
|
* - ``tabulate``
|
||||||
|
- `tabulate <https://pypi.org/project/tabulate/>`_
|
||||||
|
|
||||||
|
* - ``wandb``
|
||||||
|
- `Weights & Biases <https://github.com/wandb/wandb>`_
|
||||||
|
|
||||||
|
* - ``sentencepiece``
|
||||||
|
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
|
||||||
|
|
||||||
|
* - ``tensorboard``
|
||||||
|
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
|
||||||
|
|
||||||
|
.. container:: model-doc pyt_train_flux
|
||||||
|
|
||||||
|
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Library
|
||||||
|
- Reference
|
||||||
|
|
||||||
|
* - ``accelerate``
|
||||||
|
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||||
|
|
||||||
|
* - ``datasets``
|
||||||
|
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||||
|
|
||||||
|
* - ``sentencepiece``
|
||||||
|
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
|
||||||
|
|
||||||
|
* - ``tensorboard``
|
||||||
|
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
|
||||||
|
|
||||||
|
* - ``csvkit``
|
||||||
|
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
|
||||||
|
|
||||||
|
* - ``deepspeed``
|
||||||
|
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
|
||||||
|
|
||||||
|
* - ``diffusers``
|
||||||
|
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
|
||||||
|
|
||||||
|
* - ``GitPython``
|
||||||
|
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
|
||||||
|
|
||||||
|
* - ``opencv-python-headless``
|
||||||
|
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
|
||||||
|
|
||||||
|
* - ``peft``
|
||||||
|
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
|
||||||
|
|
||||||
|
* - ``protobuf``
|
||||||
|
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
|
||||||
|
|
||||||
|
* - ``pytest``
|
||||||
|
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
|
||||||
|
|
||||||
|
* - ``python-dotenv``
|
||||||
|
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
|
||||||
|
|
||||||
|
* - ``seaborn``
|
||||||
|
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
|
||||||
|
|
||||||
|
* - ``transformers``
|
||||||
|
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
|
||||||
|
|
||||||
|
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
|
||||||
|
|
||||||
|
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
{% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
|
.. rubric:: Pretraining
|
||||||
|
|
||||||
|
To start the pre-training benchmark, use the following command with the
|
||||||
|
appropriate options. See the following list of options and their descriptions.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Name
|
||||||
|
- Options
|
||||||
|
- Description
|
||||||
|
|
||||||
|
{% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
|
||||||
|
* - ``$datatype``
|
||||||
|
- ``BF16`` or ``FP8``
|
||||||
|
- Only Llama 3.1 8B supports FP8 precision.
|
||||||
|
{% else %}
|
||||||
|
* - ``$datatype``
|
||||||
|
- ``BF16``
|
||||||
|
- Only Llama 3.1 8B supports FP8 precision.
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
* - ``$sequence_length``
|
||||||
|
- Between 2048 and 8192. 8192 by default.
|
||||||
|
- Sequence length for the language model.
|
||||||
|
|
||||||
|
{% if model.mad_tag == "pyt_train_flux" %}
|
||||||
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Occasionally, downloading the Flux dataset might fail. In the event of this
|
||||||
|
error, manually download it from Hugging Face at
|
||||||
|
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||||
|
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
|
||||||
|
the required dataset.
|
||||||
|
{% endif %}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{% if model_group.tag == "fine-tuning" %}
|
||||||
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
|
.. rubric:: Fine-tuning
|
||||||
|
|
||||||
|
To start the fine-tuning benchmark, use the following command with the
|
||||||
|
appropriate options. See the following list of options and their descriptions.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Name
|
||||||
|
- Options
|
||||||
|
- Description
|
||||||
|
|
||||||
|
* - ``$training_mode``
|
||||||
|
- ``finetune_fw``
|
||||||
|
- Full weight fine-tuning (BF16 supported)
|
||||||
|
|
||||||
|
* -
|
||||||
|
- ``finetune_lora``
|
||||||
|
- LoRA fine-tuning (BF16 supported)
|
||||||
|
|
||||||
|
* -
|
||||||
|
- ``finetune_qlora``
|
||||||
|
- QLoRA fine-tuning (BF16 supported)
|
||||||
|
|
||||||
|
* -
|
||||||
|
- ``HF_finetune_lora``
|
||||||
|
- LoRA fine-tuning with Hugging Face PEFT
|
||||||
|
|
||||||
|
* - ``$datatype``
|
||||||
|
- ``BF16``
|
||||||
|
- All models support BF16.
|
||||||
|
|
||||||
|
* - ``$sequence_length``
|
||||||
|
- Between 2048 and 16384.
|
||||||
|
- Sequence length for the language model.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
{{ model.model }} currently supports the following fine-tuning methods:
|
||||||
|
|
||||||
|
{% for method in model.training_modes %}
|
||||||
|
* ``{{ method }}``
|
||||||
|
{% endfor %}
|
||||||
|
{% if model.training_modes|length < 4 %}
|
||||||
|
|
||||||
|
The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
|
||||||
|
does not currently provide YAML configuration files for other combinations of
|
||||||
|
model and fine-tuning method.
|
||||||
|
However, you can still configure your own YAML files to enable support for
|
||||||
|
fine-tuning methods not listed here by following existing patterns in the
|
||||||
|
``/workspace/torchtune/recipes/configs`` directory.
|
||||||
|
{% endif %}
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. rubric:: Benchmarking examples
|
||||||
|
|
||||||
|
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
|
||||||
|
|
||||||
|
Further reading
|
||||||
|
===============
|
||||||
|
|
||||||
|
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||||
|
|
||||||
|
- To learn more about system settings and management practices to configure your system for
|
||||||
|
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||||
|
|
||||||
|
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||||
|
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||||
|
|
||||||
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/pytorch-training`` Docker image.
|
||||||
@@ -0,0 +1,602 @@
|
|||||||
|
.. meta::
|
||||||
|
:description: How to train a model using Megatron-LM for ROCm.
|
||||||
|
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||||
|
|
||||||
|
**********************************************
|
||||||
|
Training a model with Primus and Megatron-Core
|
||||||
|
**********************************************
|
||||||
|
|
||||||
|
`Primus <https://github.com/AMD-AIG-AIMA/Primus>`__ is a unified and flexible
|
||||||
|
LLM training framework. It streamlines LLM
|
||||||
|
training on AMD Instinct accelerators using a modular, reproducible configuration paradigm.
|
||||||
|
Primus is backend-agnostic and supports multiple training engines -- including Megatron-Core.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Primus with the Megatron-Core backend is intended to replace ROCm
|
||||||
|
Megatron-LM in this Dockerized training environment. To learn how to migrate
|
||||||
|
workloads from Megatron-LM to Primus with Megatron-Core, see
|
||||||
|
:doc:`previous-versions/megatron-lm-primus-migration-guide`.
|
||||||
|
|
||||||
|
For ease of use, AMD provides a ready-to-use Docker image for MI300 series accelerators
|
||||||
|
containing essential components for Primus and Megatron-Core.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
|
||||||
|
Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set dockers = data.dockers %}
|
||||||
|
{% set docker = dockers[0] %}
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Software component
|
||||||
|
- Version
|
||||||
|
|
||||||
|
{% for component_name, component_version in docker.components.items() %}
|
||||||
|
* - {{ component_name }}
|
||||||
|
- {{ component_version }}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-model-support:
|
||||||
|
|
||||||
|
Supported models
|
||||||
|
================
|
||||||
|
|
||||||
|
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
||||||
|
Some instructions, commands, and training examples in this documentation might
|
||||||
|
vary by model -- select one to get started.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set model_groups = data.model_groups %}
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row mt-1">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model variant</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Some models, such as Llama, require an external license agreement through
|
||||||
|
a third party (for example, Meta).
|
||||||
|
|
||||||
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
|
correctly and performing optimally.
|
||||||
|
|
||||||
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before starting training.
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
|
.. _mi300x-amd-primus-megatron-lm-training:
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set dockers = data.dockers %}
|
||||||
|
{% set docker = dockers[0] %}
|
||||||
|
|
||||||
|
Environment setup
|
||||||
|
=================
|
||||||
|
|
||||||
|
Use the following instructions to set up the environment, configure the script to train models, and
|
||||||
|
reproduce the benchmark results on MI300X series accelerators with the ``{{ docker.pull_tag }}`` image.
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-requirements:
|
||||||
|
|
||||||
|
Download the Docker image
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
1. Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ docker.pull_tag }}
|
||||||
|
|
||||||
|
2. Launch the Docker container.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker run -it \
|
||||||
|
--device /dev/dri \
|
||||||
|
--device /dev/kfd \
|
||||||
|
--device /dev/infiniband \
|
||||||
|
--network host --ipc host \
|
||||||
|
--group-add video \
|
||||||
|
--cap-add SYS_PTRACE \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--privileged \
|
||||||
|
-v $HOME:$HOME \
|
||||||
|
--shm-size 128G \
|
||||||
|
--name primus_training_env \
|
||||||
|
{{ docker.pull_tag }}
|
||||||
|
|
||||||
|
3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker start primus_training_env
|
||||||
|
docker exec -it primus_training_env bash
|
||||||
|
|
||||||
|
The Docker container hosts the verified release tag ``v0.1.0-rc1`` of the `Primus
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1>`__ repository.
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-environment-setup:
|
||||||
|
|
||||||
|
Configuration
|
||||||
|
=============
|
||||||
|
|
||||||
|
Primus defines a training configuration in YAML for each model in
|
||||||
|
`examples/megatron/configs <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set model_groups = data.model_groups %}
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
|
To update training parameters for {{ model.model }}, you can update ``examples/megatron/configs/{{ model.config_name }}``.
|
||||||
|
Note that training configuration YAML files for other models follow the same naming convention.
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
See :ref:`Key options <amd-primus-megatron-lm-benchmark-test-vars>` for more information on configuration options.
|
||||||
|
|
||||||
|
Dataset options
|
||||||
|
---------------
|
||||||
|
|
||||||
|
You can use either mock data or real data for training.
|
||||||
|
|
||||||
|
* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default
|
||||||
|
value is ``true`` (enabled).
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
mock_data: true
|
||||||
|
|
||||||
|
* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
mock_data: false
|
||||||
|
train_data_path: /path/to/your/dataset
|
||||||
|
|
||||||
|
Ensure that the files are accessible inside the Docker container.
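
For example, a quick way to confirm that a dataset path is visible from inside
the running container (the path below is the same placeholder used above, not
a real dataset location):

.. code-block:: shell

   # Run inside the primus_training_env container; replace the placeholder
   # path with your actual dataset location.
   ls -lh /path/to/your/dataset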
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-tokenizer:
|
||||||
|
|
||||||
|
Tokenizer
|
||||||
|
---------
|
||||||
|
|
||||||
|
In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama
|
||||||
|
3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
|
||||||
|
``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
|
||||||
|
definition. As such, you need to set the ``HF_TOKEN`` environment variable with
|
||||||
|
the right permissions to access the tokenizer for each model.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
# Export your HF_TOKEN in the workspace
|
||||||
|
export HF_TOKEN=<your_hftoken>
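
As an optional check, you can verify that the exported token grants access to
a gated tokenizer. This sketch assumes the ``transformers`` library available
in the container and uses the Llama 3.1 8B tokenizer as an example:

.. code-block:: shell

   # Downloads the gated tokenizer; fails with an authorization error if
   # HF_TOKEN lacks access to the repository.
   python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B'); print('tokenizer OK')"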
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-run-training:
|
||||||
|
|
||||||
|
Run training
|
||||||
|
============
|
||||||
|
|
||||||
|
Use the following example commands to set up the environment, configure
|
||||||
|
:ref:`key options <amd-primus-megatron-lm-benchmark-test-vars>`, and run training on
|
||||||
|
MI300X series accelerators with the AMD Megatron-LM environment.
|
||||||
|
|
||||||
|
Single node training
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
To run training on a single node, navigate to ``/workspace/Primus`` and run the following setup commands:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
pip install -r requirements.txt
|
||||||
|
export HSA_NO_SCRATCH_RECLAIM=1
|
||||||
|
export NVTE_CK_USES_BWD_V3=1
|
||||||
|
|
||||||
|
Once setup is complete, run the appropriate training command.
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||||
|
|
||||||
|
To run pre-training for Llama 3.3 70B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--micro_batch_size 2 \
|
||||||
|
--global_batch_size 16 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||||
|
|
||||||
|
To run pre-training for Llama 3.1 8B FP8, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50 \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
For Llama 3.1 8B BF16, use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||||
|
|
||||||
|
To run pre-training for Llama 3.1 70B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
To run the training on a single node for Llama 3.1 70B FP8 with proxy, use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50 \
|
||||||
|
--num_layers 40 \
|
||||||
|
--fp8 hybrid \
|
||||||
|
--no_fp8_weight_transpose_cache true
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Use two or more nodes to run the *full* Llama 70B model with FP8 precision.
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
|
||||||
|
|
||||||
|
To run pre-training for Llama 2 7B FP8, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50 \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
To run pre-training for Llama 2 7B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
|
||||||
|
|
||||||
|
To run pre-training for Llama 2 70B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy
|
||||||
|
|
||||||
|
To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh \
|
||||||
|
--num_layers 3 \
|
||||||
|
--moe_layer_freq 1 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||||
|
|
||||||
|
To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh \
|
||||||
|
--global_batch_size 256 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||||
|
|
||||||
|
To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||||
|
|
||||||
|
To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with 4-layer proxy,
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh \
|
||||||
|
--num_layers 4 \
|
||||||
|
--pipeline_model_parallel_size 1 \
|
||||||
|
--micro_batch_size 1 \
|
||||||
|
--global_batch_size 16 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b
|
||||||
|
|
||||||
|
To run training on a single node for Qwen 2.5 7B BF16, use the following
|
||||||
|
command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
For FP8, use the following command.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh \
|
||||||
|
--train_iters 50 \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||||
|
|
||||||
|
To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
Multi-node training examples
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
To run training on multiple nodes, you can use the
|
||||||
|
`run_slurm_pretrain.sh <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/run_slurm_pretrain.sh>`__
|
||||||
|
script to launch the multi-node workload. Use the following steps to set up your environment:
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set dockers = data.dockers %}
|
||||||
|
{% set docker = dockers[0] %}
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd /workspace/Primus/
|
||||||
|
export DOCKER_IMAGE={{ docker.pull_tag }}
|
||||||
|
export HF_TOKEN=<your_HF_token>
|
||||||
|
export HSA_NO_SCRATCH_RECLAIM=1
|
||||||
|
export NVTE_CK_USES_BWD_V3=1
|
||||||
|
export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
|
||||||
|
export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
|
||||||
|
export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
|
||||||
|
export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
* Make sure the correct network drivers are installed on the nodes. If running inside Docker, either install the drivers inside the Docker container or pass the network drivers through from the host when creating the container.
|
||||||
|
* If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus tries to auto-detect them. However, because NICs can vary across clusters, it is recommended to explicitly export the NCCL parameters for your cluster.
|
||||||
|
* To find your network interface, you can use ``ip a``.
|
||||||
|
* To find RDMA interfaces, you can use ``ibv_devices`` to list all RDMA/IB devices. A short sketch of this discovery-and-export flow follows this note.
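
The following is a minimal sketch of discovering the interfaces and exporting
the variables. The interface and device names are placeholders and will differ
on your cluster:

.. code-block:: shell

   ip a           # list network interfaces (for NCCL_SOCKET_IFNAME / GLOO_SOCKET_IFNAME)
   ibv_devices    # list RDMA/IB devices (for NCCL_IB_HCA)

   # Placeholder values -- replace them with the names reported on your nodes.
   export NCCL_SOCKET_IFNAME=ens51np0
   export GLOO_SOCKET_IFNAME=ens51np0
   export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3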
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||||
|
|
||||||
|
To train Llama 3.3 70B FP8 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||||
|
bash examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 4 \
|
||||||
|
--global_batch_size 256 \
|
||||||
|
--recompute_num_layers 80 \
|
||||||
|
--no_fp8_weight_transpose_cache true \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
To train Llama 3.3 70B BF16 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||||
|
bash examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 1 \
|
||||||
|
--global_batch_size 256 \
|
||||||
|
--recompute_num_layers 12
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||||
|
|
||||||
|
To train Llama 3.1 8B FP8 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
# Adjust the training parameters to the node count. For example, global_batch_size = 8 * single-node batch size for 8 nodes in this case.
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||||
|
bash ./examples/run_slurm_pretrain.sh \
|
||||||
|
--global_batch_size 1024 \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||||
|
|
||||||
|
To train Llama 3.1 70B FP8 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||||
|
bash examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 4 \
|
||||||
|
--global_batch_size 256 \
|
||||||
|
--recompute_num_layers 80 \
|
||||||
|
--no_fp8_weight_transpose_cache true \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
To train Llama 3.1 70B BF16 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||||
|
bash examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 1 \
|
||||||
|
--global_batch_size 256 \
|
||||||
|
--recompute_num_layers 12
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
|
||||||
|
|
||||||
|
To train Llama 2 7B FP8 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
# Adjust the training parameters to the node count. For example, global_batch_size = 8 * single-node batch size for 8 nodes in this case.
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama2_7B-pretrain.yaml bash ./examples/run_slurm_pretrain.sh --global_batch_size 2048 --fp8 hybrid
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
|
||||||
|
|
||||||
|
To train Llama 2 70B FP8 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||||
|
bash examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 10 \
|
||||||
|
--global_batch_size 640 \
|
||||||
|
--recompute_num_layers 80 \
|
||||||
|
--no_fp8_weight_transpose_cache true \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
To train Llama 2 70B BF16 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 2 \
|
||||||
|
--global_batch_size 1536 \
|
||||||
|
--recompute_num_layers 12
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||||
|
|
||||||
|
To train Mixtral 8x7B BF16 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||||
|
bash examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 2 \
|
||||||
|
--global_batch_size 256
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||||
|
|
||||||
|
To train Qwen2.5 72B FP8 on 8 nodes, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
NNODES=8 EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
|
||||||
|
bash examples/run_slurm_pretrain.sh \
|
||||||
|
--micro_batch_size 8 \
|
||||||
|
--global_batch_size 512 \
|
||||||
|
--recompute_num_layers 80 \
|
||||||
|
--no_fp8_weight_transpose_cache true \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-benchmark-test-vars:
|
||||||
|
|
||||||
|
Key options
|
||||||
|
-----------
|
||||||
|
|
||||||
|
The following are key options to take note of:
|
||||||
|
|
||||||
|
fp8
|
||||||
|
``hybrid`` enables FP8 GEMMs.
|
||||||
|
|
||||||
|
use_torch_fsdp2
|
||||||
|
``use_torch_fsdp2: 1`` enables torch FSDP v2. If FSDP is enabled,
|
||||||
|
set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``, as shown in the sketch after this list.
|
||||||
|
|
||||||
|
profile
|
||||||
|
To enable PyTorch profiling, set these parameters:
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
profile: true
|
||||||
|
use_pytorch_profiler: true
|
||||||
|
profile_step_end: 7
|
||||||
|
profile_step_start: 6
|
||||||
|
|
||||||
|
train_iters
|
||||||
|
The total number of iterations (default: 50).
|
||||||
|
|
||||||
|
mock_data
|
||||||
|
``true`` by default.
|
||||||
|
|
||||||
|
micro_batch_size
|
||||||
|
Micro batch size.
|
||||||
|
|
||||||
|
global_batch_size
|
||||||
|
Global batch size.
|
||||||
|
|
||||||
|
recompute_granularity
|
||||||
|
For activation checkpointing.
|
||||||
|
|
||||||
|
num_layers
|
||||||
|
For using a reduced number of layers as with proxy models.
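
As a sketch only, the FSDP-related overrides described under ``use_torch_fsdp2``
might look like this in a training config's ``overrides`` section (values are
illustrative, not a validated configuration):

.. code-block:: yaml

   overrides:
     # Illustrative only: enable torch FSDP v2 and disable the options that
     # conflict with it.
     use_torch_fsdp2: 1
     use_distributed_optimizer: false
     overlap_param_gather: false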
|
||||||
|
|
||||||
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`previous-versions/megatron-lm-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/megatron-lm`` Docker image.
|
||||||
|
|
||||||
|
This training environment now uses Primus with the Megatron-Core backend as its primary
|
||||||
|
configuration. Limited support for the legacy ROCm Megatron-LM is still
|
||||||
|
available. For instructions on using ROCm Megatron-LM, see the
|
||||||
|
:doc:`megatron-lm` document.
|
||||||
@@ -9,28 +9,25 @@ Training a model with PyTorch for ROCm

 PyTorch is an open-source machine learning framework that is widely used for
 model training with GPU-optimized components for transformer-based models.

-The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
-(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
-model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
-training workloads:
-
-| Software component | Version            |
-| ROCm               | 6.3.4              |
-| PyTorch            | 2.8.0a0+git7d205b2 |
-| Python             | 3.10.17            |
-| Transformer Engine | 1.14.0+2f85f5f2    |
-| Flash Attention    | 3.0.0.post1        |
-| hipBLASLt          | 0.15.0-8c6919d     |
-| Triton             | 3.3.0              |
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   {% set docker = dockers[0] %}
+   The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__
+   (``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
+   model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
+   training workloads:
+
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}

 .. _amd-pytorch-training-model-support:
@@ -38,26 +35,27 @@ Supported models
 ================

 The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+Some instructions, commands, and training recommendations in this documentation might
+vary by model -- select one to get started.

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

-   {% set unified_docker = data.unified_docker.latest %}
+   {% set unified_docker = data.dockers[0] %}
    {% set model_groups = data.model_groups %}

    .. raw:: html

       <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
         <div class="row">
-          <div class="col-2 me-2 model-param-head">Workload</div>
+          <div class="col-2 me-2 model-param-head">Model group</div>
           <div class="row col-10">
             {% for model_group in model_groups %}
-            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
             {% endfor %}
           </div>
         </div>

         <div class="row mt-1">
-          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="col-2 me-2 model-param-head">Model variant</div>
           <div class="row col-10">
             {% for model_group in model_groups %}
             {% set models = model_group.models %}
@@ -73,84 +71,116 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
           </div>
         </div>

   .. _amd-pytorch-training-supported-training-modes:

   The following table lists supported training modes per model.

   .. dropdown:: Supported training modes

      .. list-table::
         :header-rows: 1

         * - Model
           - Supported training modes

         {% for model_group in model_groups %}
         {% set models = model_group.models %}
         {% for model in models %}
         * - {{ model.model }}
           - ``{{ model.training_modes | join('``, ``') }}``
         {% endfor %}
         {% endfor %}

      .. note::

         Some model and fine-tuning combinations are not listed. This is
         because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
         doesn't provide default YAML configurations for them.
         For advanced usage, you can create a custom configuration to enable
         unlisted fine-tuning methods by using an existing file in the
         ``/workspace/torchtune/recipes/configs`` directory as a template.

.. _amd-pytorch-training-performance-measurements:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.

.. note::

   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
   should not be interpreted as the peak performance achievable by AMD
   Instinct MI325X and MI300X accelerators or ROCm software.

System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn't test configurations and run conditions outside those described.

Run training
============

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

   {% set unified_docker = data.dockers[0] %}
   {% set model_groups = data.model_groups %}

   Once the setup is complete, choose between two options to start benchmarking training:

   .. tab-set::

      .. tab-item:: MAD-integrated benchmarking

         1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
            directory and install the required packages on the host machine.

            .. code-block:: shell

               git clone https://github.com/ROCm/MAD
               cd MAD
               pip install -r requirements.txt

         {% for model_group in model_groups %}
         {% for model in model_group.models %}

         .. container:: model-doc {{ model.mad_tag }}

            2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
               using one node with the {{ model.precision }} data type on the host machine.

               .. code-block:: shell

                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
                  madengine run \
                     --tags {{ model.mad_tag }} \
                     --keep-model-dir \
                     --live-output \
                     --timeout 28800

               MAD launches a Docker container with the name
               ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
               model are collected in ``~/MAD/perf.csv``.

         {% endfor %}
         {% endfor %}
@@ -159,222 +189,213 @@ The following models are pre-optimized for performance on the AMD Instinct MI325

         .. rubric:: Download the Docker image and required packages

         1. Use the following command to pull the Docker image from Docker Hub.

            .. code-block:: shell

               docker pull {{ unified_docker.pull_tag }}

         2. Run the Docker container.

            .. code-block:: shell

               docker run -it \
                  --device /dev/dri \
                  --device /dev/kfd \
                  --network host \
                  --ipc host \
                  --group-add video \
                  --cap-add SYS_PTRACE \
                  --security-opt seccomp=unconfined \
                  --privileged \
                  -v $HOME:$HOME \
                  -v $HOME/.ssh:/root/.ssh \
                  --shm-size 64G \
                  --name training_env \
                  {{ unified_docker.pull_tag }}

            Use these commands if you exit the ``training_env`` container and need to return to it.

            .. code-block:: shell

               docker start training_env
               docker exec -it training_env bash

         3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
            repository and navigate to the benchmark scripts directory
            ``/workspace/MAD/scripts/pytorch_train``.

            .. code-block:: shell

               git clone https://github.com/ROCm/MAD
               cd MAD/scripts/pytorch_train

         .. rubric:: Prepare training datasets and dependencies

         1. The following benchmarking examples require downloading models and datasets
            from Hugging Face. To ensure successful access to gated repos, set your
            ``HF_TOKEN``.

            .. code-block:: shell

               export HF_TOKEN=$your_personal_hugging_face_access_token

         2. Run the setup script to install libraries and datasets needed for benchmarking.

            .. code-block:: shell

               ./pytorch_benchmark_setup.sh

         .. container:: model-doc pyt_train_llama-3.1-8b

            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:

            .. list-table::
               :header-rows: 1

               * - Library
                 - Reference
               * - ``accelerate``
                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
               * - ``datasets``
                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

         .. container:: model-doc pyt_train_llama-3.1-70b

            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:

            .. list-table::
               :header-rows: 1

               * - Library
                 - Reference
               * - ``datasets``
                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
               * - ``torchdata``
                 - `TorchData <https://pytorch.org/data/beta/index.html>`_
               * - ``tomli``
                 - `Tomli <https://pypi.org/project/tomli/>`_
               * - ``tiktoken``
                 - `tiktoken <https://github.com/openai/tiktoken>`_
               * - ``blobfile``
                 - `blobfile <https://pypi.org/project/blobfile/>`_
               * - ``tabulate``
                 - `tabulate <https://pypi.org/project/tabulate/>`_
               * - ``wandb``
                 - `Weights & Biases <https://github.com/wandb/wandb>`_
               * - ``sentencepiece``
                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
               * - ``tensorboard``
                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

         .. container:: model-doc pyt_train_flux

            ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:

            .. list-table::
               :header-rows: 1

               * - Library
                 - Reference
               * - ``accelerate``
                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
               * - ``datasets``
                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
               * - ``sentencepiece``
                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
               * - ``tensorboard``
                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
               * - ``csvkit``
                 - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
               * - ``deepspeed``
                 - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
               * - ``diffusers``
                 - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
               * - ``GitPython``
                 - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
               * - ``opencv-python-headless``
                 - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
               * - ``peft``
                 - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
               * - ``protobuf``
                 - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
               * - ``pytest``
                 - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
               * - ``python-dotenv``
                 - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
               * - ``seaborn``
                 - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
               * - ``transformers``
                 - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0

            ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:

            * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_

         {% for model_group in model_groups %}
         {% for model in model_group.models %}
         {% set training_modes = model.training_modes %}
         {% set training_mode_descs = {
            "pretrain": "Benchmark pre-training.",
            "HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
         } %}
         {% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
         {% if available_modes %}

         .. container:: model-doc {{ model.mad_tag }}

            .. rubric:: Pre-training

            To start the pre-training benchmark, use the following command with the
            appropriate options. See the following list of options and their descriptions.

            .. code-block:: shell

               ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
                  -m {{ model.model_repo }} \
                  -p $datatype \
                  -s $sequence_length

            .. list-table::
               :header-rows: 1
               * - Name
                 - Options
                 - Description
               {% for mode in available_modes %}
               * - {% if loop.first %}``$training_mode``{% endif %}
                 - ``{{ mode }}``
                 - {{ training_mode_descs[mode] }}
               {% endfor %}
               * - ``$datatype``
                 - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
                 - Only Llama 3.1 8B supports FP8 precision.
               * - ``$sequence_length``
                 - Between 2048 and 8192. 8192 by default.
                 - Sequence length for the language model.

         {% if model.mad_tag == "pyt_train_flux" %}

         .. container:: model-doc {{ model.mad_tag }}

            .. note::

               Currently, FLUX models are not supported out-of-the-box on {{ unified_docker.pull_tag }}.
               To use FLUX, refer to the previous version of the ``pytorch-training`` Docker:
               :doc:`previous-versions/pytorch-training-v25.6`.

               Occasionally, downloading the Flux dataset might fail. In the event of this
               error, manually download it from Hugging Face at
               `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
               and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
               the required dataset.
         {% endif %}
         {% endif %}

         {% set training_mode_descs = {
            "finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
            "finetune_lora": "LoRA fine-tuning (BF16 supported).",
            "finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
            "HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT."
         } %}
         {% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
         {% if available_modes %}

         .. container:: model-doc {{ model.mad_tag }}

            .. rubric:: Fine-tuning

            To start the fine-tuning benchmark, use the following command with the
            appropriate options. See the list of options and their descriptions below, and the
            :ref:`supported training modes <amd-pytorch-training-supported-training-modes>`.

            .. code-block:: shell

               ./pytorch_benchmark_report.sh -t $training_mode \
                  -m {{ model.model_repo }} \
                  -p $datatype \
                  -s $sequence_length

            .. list-table::
               :header-rows: 1

               * - Name
                 - Options
                 - Description
               {% for mode in available_modes %}
               * - {% if loop.first %}``$training_mode``{% endif %}
                 - ``{{ mode }}``
                 - {{ training_mode_descs[mode] }}
               {% endfor %}
               * - ``$datatype``
                 - ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
                 - All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}
               * - ``$sequence_length``
                 - Between 2048 and 16384.
                 - Sequence length for the language model.

            {% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}

            .. note::

               For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
               use the following torchtune commit for compatibility:

               .. code-block:: shell

                  git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e

            {% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}

            .. note::

               You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
               input tensor should be smaller than max_seq_len (4096)``.
               This error indicates that an input sequence is longer than the model's maximum context window.

               Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
               tokens in this case). You can resolve this by truncating the input or splitting
               it into smaller chunks before passing it to the model.

               Note on reproducibility: the results in this guide are based on
               commit ``b4c98ac`` from the upstream
               `<https://github.com/pytorch/torchtune>`__ repository. For the
               latest updates, you can use the main branch.

            {% endif %}
         {% endif %}
         {% endfor %}
         {% endfor %}

         .. rubric:: Benchmarking examples

         For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
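         As a concrete illustration of the benchmark command described above, a single-node BF16
         pre-training run at the default sequence length could look like the following. This is a
         sketch only -- the ``-m`` value is a placeholder for the model repository ID reported for
         your selected model, not a value prescribed by this guide.

         .. code-block:: shell

            # Illustrative invocation: pre-train in BF16 at the default 8192 sequence length.
            # Replace Llama-3.1-8B with the model repository ID for your selected model.
            ./pytorch_benchmark_report.sh -t pretrain -m Llama-3.1-8B -p BF16 -s 8192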
Multi-node training
-------------------

Pre-training
~~~~~~~~~~~~

Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.

To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.

.. code-block:: shell

   # In the MAD repository
   cd scripts/pytorch_train
   sbatch run_slurm_train.sh

Fine-tuning
~~~~~~~~~~~

Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.

To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.

.. code-block:: shell

   huggingface-cli login    # Get access to the HF Llama model space
   huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct    # Download the Llama 3.3 model locally
   # In the MAD repository
   cd scripts/pytorch_train
   sbatch Torchtune_Multinode.sh

.. note::

   Information regarding benchmark setup:

   * By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
   * You can adjust the torchtune `YAML configuration file
     <https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
     if you're using a different model.
   * The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
   * Set the ``mounting_paths`` inside the SLURM script.

Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
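As a sketch of how such a run might be adjusted and inspected, the SLURM parameters can be overridden
at submission time and the output followed from the results directory. The node count and log file
pattern below are illustrative assumptions, not values prescribed by this guide:

.. code-block:: shell

   # Illustrative only: request a different node count for the torchtune job
   # and follow its logs once files appear in result_torchtune/.
   sbatch --nodes=4 Torchtune_Multinode.sh
   tail -f result_torchtune/*.log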
Further reading
===============
@@ -21,6 +21,8 @@ In this guide, you'll learn about:

 - Training a model

+  - :doc:`With Primus (Megatron-LM backend) <benchmark-docker/primus-megatron>`
+
   - :doc:`With Megatron-LM <benchmark-docker/megatron-lm>`

   - :doc:`With PyTorch <benchmark-docker/pytorch-training>`
@@ -285,7 +285,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
    - Radeon AI PRO R9700
    - RDNA4
    - gfx1201
-   - 16
+   - 32
    - 64
    - 32 or 64
    - 128
@@ -27,6 +27,24 @@ subtrees:
       title: ROCm on Radeon GPUs
     - file: how-to/deep-learning-rocm.md
       title: Deep learning frameworks
+      subtrees:
+      - entries:
+        - file: compatibility/ml-compatibility/pytorch-compatibility.rst
+          title: PyTorch compatibility
+        - file: compatibility/ml-compatibility/tensorflow-compatibility.rst
+          title: TensorFlow compatibility
+        - file: compatibility/ml-compatibility/jax-compatibility.rst
+          title: JAX compatibility
+        - file: compatibility/ml-compatibility/verl-compatibility.rst
+          title: verl compatibility
+        - file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
+          title: Stanford Megatron-LM compatibility
+        - file: compatibility/ml-compatibility/dgl-compatibility.rst
+          title: DGL compatibility
+        - file: compatibility/ml-compatibility/megablocks-compatibility.rst
+          title: Megablocks compatibility
+        - file: compatibility/ml-compatibility/taichi-compatibility.rst
+          title: Taichi compatibility
     - file: how-to/build-rocm.rst
       title: Build ROCm from source
@@ -44,8 +62,8 @@ subtrees:
       title: Training
       subtrees:
       - entries:
-        - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
-          title: Train a model with Megatron-LM
+        - file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+          title: Train a model with Primus and Megatron-Core
         - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
           title: Train a model with PyTorch
         - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -1,4 +1,4 @@
-rocm-docs-core==1.20.1
+rocm-docs-core==1.22.0
 sphinx-reredirects
 sphinx-sitemap
 sphinxcontrib.datatemplates==0.11.0
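The pinned ``requirements.txt`` below is the lock file derived from ``requirements.in``. As a sketch,
assuming pip-tools is the lock tool in use (this changeset does not state which tool generated the file),
the pins would be regenerated like this after editing ``requirements.in``:

.. code-block:: shell

   # Illustrative only: regenerate the pinned lock file from requirements.in,
   # assuming pip-tools provides the pip-compile command in this environment.
   pip install pip-tools
   pip-compile requirements.in -o requirements.txt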
@@ -23,7 +23,7 @@ beautifulsoup4==4.13.4
 breathe==4.36.0
     # via rocm-docs-core
-certifi==2025.4.26
+certifi==2025.7.14
     # via requests
 cffi==1.17.1
@@ -35,18 +35,16 @@ click==8.2.1
-comm==0.2.2
+comm==0.2.3
     # via ipykernel
-cryptography==45.0.3
+cryptography==45.0.5
     # via pyjwt
-debugpy==1.8.14
+debugpy==1.8.15
     # via ipykernel
 decorator==5.2.1
 defusedxml==0.7.1
     # via sphinxcontrib-datatemplates
-deprecated==1.2.18
-    # via pygithub
 docutils==0.21.2
@@ -62,7 +60,7 @@ fastjsonschema==2.21.1
 gitdb==4.0.12
     # via gitpython
-gitpython==3.1.44
+gitpython==3.1.45
     # via rocm-docs-core
 greenlet==3.2.3
@@ -74,7 +72,7 @@ importlib-metadata==8.7.0
-ipykernel==6.29.5
+ipykernel==6.30.0
     # via myst-nb
 ipython==8.37.0
@@ -86,7 +84,7 @@ jinja2==3.1.6
-jsonschema==4.24.0
+jsonschema==4.25.0
     # via nbformat
 jsonschema-specifications==2025.4.1
@@ -116,7 +114,7 @@ mdit-py-plugins==0.4.2
 mdurl==0.1.2
-myst-nb==1.2.0
+myst-nb==1.3.0
     # via rocm-docs-core
 myst-parser==4.0.1
@@ -134,7 +132,6 @@ nest-asyncio==1.6.0
 packaging==25.0
     # via
     #   ipykernel
-    #   pydata-sphinx-theme
     #   sphinx
 parso==0.8.4
@@ -152,13 +149,13 @@ pure-eval==0.2.3
 pycparser==2.22
     # via cffi
-pydata-sphinx-theme==0.15.4
+pydata-sphinx-theme==0.16.1
     # via
     #   rocm-docs-core
     #   sphinx-book-theme
-pygithub==2.6.1
+pygithub==2.7.0
     # via rocm-docs-core
-pygments==2.19.1
+pygments==2.19.2
@@ -178,7 +175,7 @@ pyyaml==6.0.2
-pyzmq==26.4.0
+pyzmq==27.0.0
     # via
     #   ipykernel
     #   jupyter-client
@@ -190,9 +187,9 @@ requests==2.32.4
-rocm-docs-core==1.20.1
+rocm-docs-core==1.22.0
     # via -r requirements.in
-rpds-py==0.25.1
+rpds-py==0.26.0
@@ -220,7 +217,7 @@ sphinx==8.1.3
-sphinx-book-theme==1.1.4
+sphinx-book-theme==1.1.3
     # via rocm-docs-core
 sphinx-copybutton==0.5.2
@@ -252,7 +249,7 @@ sphinxcontrib-runcmd==0.2.0
 sphinxcontrib-serializinghtml==2.0.0
-sqlalchemy==2.0.41
+sqlalchemy==2.0.42
     # via jupyter-cache
 stack-data==0.6.3
@@ -266,7 +263,6 @@ tornado==6.5.1
 traitlets==5.14.3
     # via
-    #   comm
     #   ipykernel
     #   ipython
@@ -274,7 +270,7 @@ traitlets==5.14.3
-typing-extensions==4.14.0
+typing-extensions==4.14.1
@@ -290,7 +286,5 @@ urllib3==2.5.0
 wcwidth==0.2.13
     # via prompt-toolkit
-wrapt==1.17.2
-    # via deprecated
 zipp==3.23.0
     # via importlib-metadata