Mirror of https://github.com/ROCm/ROCm.git (synced 2026-01-10 23:28:03 -05:00)

Compare commits: update_jax...link-fix (31 commits)
Commit SHAs:

3c3847f9f7, 249bd177ec, b2ee8d4b2e, 3f834cf520, 70ba866c5b, 320ec4669a, c9bd93b537,
a060550bcd, c92cbaee66, c84afacc8d, 843fd1b3fb, 82221c4e2d, d0ebe126e7, 74610893a9,
afe3e21cad, ae2440772f, 61f970a24d, 85a1682573, 87c6e320b4, b50948fe6b, 91407405a9,
8f23f63a6b, 11747aaadc, 1088beefe5, b7988925a5, 89dafa6232, 8054852dad, 542d7813ce,
bc1ffe4fcb, 09997c68bb, 42bc3501ac
@@ -5,6 +5,7 @@ ACEs
 ACS
 AccVGPR
 AccVGPRs
+AITER
 ALU
 AllReduce
 AMD
@@ -115,6 +116,7 @@ Deprecations
 DevCap
 DirectX
 Dockerfile
+Dockerized
 Doxygen
 dropless
 ELMo
@@ -122,6 +124,7 @@ ENDPGM
 EPYC
 ESXi
 EoS
+fas
 FBGEMM
 FFT
 FFTs
@@ -194,6 +197,7 @@ HWE
 HWS
 Haswell
 Higgs
+href
 Hyperparameters
 Huggingface
 ICD
@@ -360,6 +364,7 @@ PowerEdge
 PowerShell
 Pretrained
 Pretraining
+Primus
 Profiler's
 PyPi
 Pytest
@@ -524,6 +529,7 @@ Xilinx
 Xnack
 Xteam
 YAML
+YAMLs
 YML
 YModel
 ZeRO
@@ -584,6 +590,7 @@ completers
 composable
 concretization
 config
+configs
 conformant
 constructible
 convolutional
@@ -794,7 +801,9 @@ preprocessing
 preprocessor
 prequantized
 prerequisites
+pretrain
 pretraining
+primus
 profiler
 profilers
 protobuf
@@ -31,9 +31,9 @@ ROCm Version,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6
 :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
 :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
 :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A
-:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,N/A,N/A,85f95ae,85f95ae,85f95ae,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
-:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`,N/A,N/A,N/A,N/A,0.7.0,0.7.0,0.7.0,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
 ,,,,,,,,,,,,,,,,,,
@@ -242,7 +242,9 @@ Expand for full historical view of:
 .. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
 .. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
 .. [#verl_compat] verl is only supported on ROCm 6.2.0.
+.. [#stanford-megatron-lm_compat] Stanford Megatron-LM is only supported on ROCm 6.3.0.
 .. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
+.. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
 .. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
 .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
 .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
@@ -147,6 +147,8 @@ article_pages = [
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},

@@ -0,0 +1,163 @@
+vllm_benchmark:
+  unified_docker:
+    latest:
+      # TODO: update me
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
+      rocm_version: 6.4.1
+      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a
+      hipblaslt_version: 0.15
+  model_groups:
+    - group: Meta Llama
+      tag: llama
+      models:
+        - model: Llama 3.1 8B
+          mad_tag: pyt_vllm_llama-3.1-8b
+          model_repo: meta-llama/Llama-3.1-8B-Instruct
+          url: https://huggingface.co/meta-llama/Llama-3.1-8B
+          precision: float16
+        - model: Llama 3.1 70B
+          mad_tag: pyt_vllm_llama-3.1-70b
+          model_repo: meta-llama/Llama-3.1-70B-Instruct
+          url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+          precision: float16
+        - model: Llama 3.1 405B
+          mad_tag: pyt_vllm_llama-3.1-405b
+          model_repo: meta-llama/Llama-3.1-405B-Instruct
+          url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+          precision: float16
+        - model: Llama 2 7B
+          mad_tag: pyt_vllm_llama-2-7b
+          model_repo: meta-llama/Llama-2-7b-chat-hf
+          url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+          precision: float16
+        - model: Llama 2 70B
+          mad_tag: pyt_vllm_llama-2-70b
+          model_repo: meta-llama/Llama-2-70b-chat-hf
+          url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+          precision: float16
+        - model: Llama 3.1 8B FP8
+          mad_tag: pyt_vllm_llama-3.1-8b_fp8
+          model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+          url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+          precision: float8
+        - model: Llama 3.1 70B FP8
+          mad_tag: pyt_vllm_llama-3.1-70b_fp8
+          model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+          url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+          precision: float8
+        - model: Llama 3.1 405B FP8
+          mad_tag: pyt_vllm_llama-3.1-405b_fp8
+          model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+          url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+          precision: float8
+    - group: Mistral AI
+      tag: mistral
+      models:
+        - model: Mixtral MoE 8x7B
+          mad_tag: pyt_vllm_mixtral-8x7b
+          model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+          url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+          precision: float16
+        - model: Mixtral MoE 8x22B
+          mad_tag: pyt_vllm_mixtral-8x22b
+          model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+          url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+          precision: float16
+        - model: Mistral 7B
+          mad_tag: pyt_vllm_mistral-7b
+          model_repo: mistralai/Mistral-7B-Instruct-v0.3
+          url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+          precision: float16
+        - model: Mixtral MoE 8x7B FP8
+          mad_tag: pyt_vllm_mixtral-8x7b_fp8
+          model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+          url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+          precision: float8
+        - model: Mixtral MoE 8x22B FP8
+          mad_tag: pyt_vllm_mixtral-8x22b_fp8
+          model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+          url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+          precision: float8
+        - model: Mistral 7B FP8
+          mad_tag: pyt_vllm_mistral-7b_fp8
+          model_repo: amd/Mistral-7B-v0.1-FP8-KV
+          url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
+          precision: float8
+    - group: Qwen
+      tag: qwen
+      models:
+        - model: Qwen2 7B
+          mad_tag: pyt_vllm_qwen2-7b
+          model_repo: Qwen/Qwen2-7B-Instruct
+          url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
+          precision: float16
+        - model: Qwen2 72B
+          mad_tag: pyt_vllm_qwen2-72b
+          model_repo: Qwen/Qwen2-72B-Instruct
+          url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
+          precision: float16
+        - model: QwQ-32B
+          mad_tag: pyt_vllm_qwq-32b
+          model_repo: Qwen/QwQ-32B
+          url: https://huggingface.co/Qwen/QwQ-32B
+          precision: float16
+          tunableop: true
+    - group: Databricks DBRX
+      tag: dbrx
+      models:
+        - model: DBRX Instruct
+          mad_tag: pyt_vllm_dbrx-instruct
+          model_repo: databricks/dbrx-instruct
+          url: https://huggingface.co/databricks/dbrx-instruct
+          precision: float16
+        - model: DBRX Instruct FP8
+          mad_tag: pyt_vllm_dbrx_fp8
+          model_repo: amd/dbrx-instruct-FP8-KV
+          url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
+          precision: float8
+    - group: Google Gemma
+      tag: gemma
+      models:
+        - model: Gemma 2 27B
+          mad_tag: pyt_vllm_gemma-2-27b
+          model_repo: google/gemma-2-27b
+          url: https://huggingface.co/google/gemma-2-27b
+          precision: float16
+    - group: Cohere
+      tag: cohere
+      models:
+        - model: C4AI Command R+ 08-2024
+          mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
+          model_repo: CohereForAI/c4ai-command-r-plus-08-2024
+          url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
+          precision: float16
+        - model: C4AI Command R+ 08-2024 FP8
+          mad_tag: pyt_vllm_command-r-plus_fp8
+          model_repo: amd/c4ai-command-r-plus-FP8-KV
+          url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
+          precision: float8
+    - group: DeepSeek
+      tag: deepseek
+      models:
+        - model: DeepSeek MoE 16B
+          mad_tag: pyt_vllm_deepseek-moe-16b-chat
+          model_repo: deepseek-ai/deepseek-moe-16b-chat
+          url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
+          precision: float16
+    - group: Microsoft Phi
+      tag: phi
+      models:
+        - model: Phi-4
+          mad_tag: pyt_vllm_phi-4
+          model_repo: microsoft/phi-4
+          url: https://huggingface.co/microsoft/phi-4
+    - group: TII Falcon
+      tag: falcon
+      models:
+        - model: Falcon 180B
+          mad_tag: pyt_vllm_falcon-180b
+          model_repo: tiiuae/falcon-180B
+          url: https://huggingface.co/tiiuae/falcon-180B
+          precision: float16
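For reference, the ``pull_tag`` recorded above corresponds to a plain ``docker pull`` of the prebuilt image. A minimal sketch, assuming access to Docker Hub:

   docker pull rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715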
@@ -39,7 +39,7 @@ pytorch_inference_benchmark:
           model_repo: Wan-AI/Wan2.1-T2V-14B
           url: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
           precision: bfloat16
-    - group: Janus-Pro
+    - group: Janus Pro
       tag: janus-pro
       models:
         - model: Janus Pro 7B
@@ -47,3 +47,11 @@ pytorch_inference_benchmark:
           model_repo: deepseek-ai/Janus-Pro-7B
           url: https://huggingface.co/deepseek-ai/Janus-Pro-7B
           precision: bfloat16
+    - group: Hunyuan Video
+      tag: hunyuan
+      models:
+        - model: Hunyuan Video
+          mad_tag: pyt_hy_video
+          model_repo: tencent/HunyuanVideo
+          url: https://huggingface.co/tencent/HunyuanVideo
+          precision: float16
@@ -2,11 +2,11 @@ vllm_benchmark:
   unified_docker:
     latest:
       # TODO: update me
-      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
       rocm_version: 6.4.1
-      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
-      pytorch_version: 2.7.0+gitf717b2a
+      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a (2.7.0+gitf717b2a)
       hipblaslt_version: 0.15
   model_groups:
     - group: Meta Llama
@@ -27,11 +27,6 @@ vllm_benchmark:
           model_repo: meta-llama/Llama-3.1-405B-Instruct
           url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
           precision: float16
-        - model: Llama 2 7B
-          mad_tag: pyt_vllm_llama-2-7b
-          model_repo: meta-llama/Llama-2-7b-chat-hf
-          url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
-          precision: float16
         - model: Llama 2 70B
           mad_tag: pyt_vllm_llama-2-70b
           model_repo: meta-llama/Llama-2-70b-chat-hf
@@ -65,11 +60,6 @@ vllm_benchmark:
           model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
           url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
           precision: float16
-        - model: Mistral 7B
-          mad_tag: pyt_vllm_mistral-7b
-          model_repo: mistralai/Mistral-7B-Instruct-v0.3
-          url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
-          precision: float16
         - model: Mixtral MoE 8x7B FP8
           mad_tag: pyt_vllm_mixtral-8x7b_fp8
           model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
@@ -80,72 +70,15 @@ vllm_benchmark:
           model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
           url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
           precision: float8
-        - model: Mistral 7B FP8
-          mad_tag: pyt_vllm_mistral-7b_fp8
-          model_repo: amd/Mistral-7B-v0.1-FP8-KV
-          url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
-          precision: float8
     - group: Qwen
       tag: qwen
       models:
-        - model: Qwen2 7B
-          mad_tag: pyt_vllm_qwen2-7b
-          model_repo: Qwen/Qwen2-7B-Instruct
-          url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
-          precision: float16
-        - model: Qwen2 72B
-          mad_tag: pyt_vllm_qwen2-72b
-          model_repo: Qwen/Qwen2-72B-Instruct
-          url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
-          precision: float16
         - model: QwQ-32B
           mad_tag: pyt_vllm_qwq-32b
           model_repo: Qwen/QwQ-32B
           url: https://huggingface.co/Qwen/QwQ-32B
           precision: float16
           tunableop: true
-    - group: Databricks DBRX
-      tag: dbrx
-      models:
-        - model: DBRX Instruct
-          mad_tag: pyt_vllm_dbrx-instruct
-          model_repo: databricks/dbrx-instruct
-          url: https://huggingface.co/databricks/dbrx-instruct
-          precision: float16
-        - model: DBRX Instruct FP8
-          mad_tag: pyt_vllm_dbrx_fp8
-          model_repo: amd/dbrx-instruct-FP8-KV
-          url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
-          precision: float8
-    - group: Google Gemma
-      tag: gemma
-      models:
-        - model: Gemma 2 27B
-          mad_tag: pyt_vllm_gemma-2-27b
-          model_repo: google/gemma-2-27b
-          url: https://huggingface.co/google/gemma-2-27b
-          precision: float16
-    - group: Cohere
-      tag: cohere
-      models:
-        - model: C4AI Command R+ 08-2024
-          mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
-          model_repo: CohereForAI/c4ai-command-r-plus-08-2024
-          url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
-          precision: float16
-        - model: C4AI Command R+ 08-2024 FP8
-          mad_tag: pyt_vllm_command-r-plus_fp8
-          model_repo: amd/c4ai-command-r-plus-FP8-KV
-          url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
-          precision: float8
-    - group: DeepSeek
-      tag: deepseek
-      models:
-        - model: DeepSeek MoE 16B
-          mad_tag: pyt_vllm_deepseek-moe-16b-chat
-          model_repo: deepseek-ai/deepseek-moe-16b-chat
-          url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
-          precision: float16
     - group: Microsoft Phi
       tag: phi
       models:
@@ -153,11 +86,3 @@ vllm_benchmark:
           mad_tag: pyt_vllm_phi-4
           model_repo: microsoft/phi-4
           url: https://huggingface.co/microsoft/phi-4
-    - group: TII Falcon
-      tag: falcon
-      models:
-        - model: Falcon 180B
-          mad_tag: pyt_vllm_falcon-180b
-          model_repo: tiiuae/falcon-180B
-          url: https://huggingface.co/tiiuae/falcon-180B
-          precision: float16
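To confirm that a pulled image matches the ``vllm_version`` recorded above, the version can be queried inside the container. A sketch, assuming the image has been pulled and ``python3`` is on its default path:

   docker run --rm rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812 python3 -c "import vllm; print(vllm.__version__)"

The printed value should correspond to the ``vllm_version`` field (0.10.0).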
@@ -1,26 +1,15 @@
 dockers:
-  - pull_tag: rocm/megatron-lm:v25.6_py312
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
     components:
-      ROCm: 6.4.1
-      PyTorch: 2.8.0a0+git7d205b2
-      Python: 3.12
-      Transformer Engine: 2.1.0.dev0+8c4a512
-      hipBLASLt: 393e413
-      Triton: 3.3.0
-      RCCL: 2.23.4.7a84c5d
-    doc_name: Ubuntu 24.04 + Python 3.12
-  - pull_tag: rocm/megatron-lm:v25.6_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
-    components:
-      ROCm: 6.4.1
-      PyTorch: 2.8.0a0+git7d205b2
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
       Python: "3.10"
-      Transformer Engine: 2.1.0.dev0+8c4a512
-      hipBLASLt: 393e413
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
       Triton: 3.3.0
-      RCCL: 2.23.4.7a84c5d
-    doc_name: Ubuntu 22.04 + Python 3.10
+      RCCL: 2.22.3
 model_groups:
   - group: Meta Llama
     tag: llama
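The updated training image can be pulled and started interactively in the same way as the other ROCm images in this changeset. A sketch using the standard ROCm device flags that appear in the accompanying vLLM guide:

   docker pull rocm/megatron-lm:v25.7_py310
   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G rocm/megatron-lm:v25.7_py310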
@@ -0,0 +1,60 @@
+dockers:
+  - pull_tag: rocm/megatron-lm:v25.6_py312
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
+    components:
+      ROCm: 6.4.1
+      PyTorch: 2.8.0a0+git7d205b2
+      Python: 3.12
+      Transformer Engine: 2.1.0.dev0+8c4a512
+      hipBLASLt: 393e413
+      Triton: 3.3.0
+      RCCL: 2.23.4.7a84c5d
+    doc_name: Ubuntu 24.04 + Python 3.12
+  - pull_tag: rocm/megatron-lm:v25.6_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
+    components:
+      ROCm: 6.4.1
+      PyTorch: 2.8.0a0+git7d205b2
+      Python: "3.10"
+      Transformer Engine: 2.1.0.dev0+8c4a512
+      hipBLASLt: 393e413
+      Triton: 3.3.0
+      RCCL: 2.23.4.7a84c5d
+    doc_name: Ubuntu 22.04 + Python 3.10
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
+      - model: Llama 3.1 8B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
+      - model: Llama 3.1 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
+      - model: Llama 3.1 70B (proxy)
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
+      - model: Llama 2 7B
+        mad_tag: pyt_megatron_lm_train_llama-2-7b
+      - model: Llama 2 70B
+        mad_tag: pyt_megatron_lm_train_llama-2-70b
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
+      - model: DeepSeek-V2-Lite
+        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
+      - model: Qwen 2.5 72B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
@@ -0,0 +1,58 @@
+dockers:
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
+    components:
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
+      Python: "3.10"
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
+      Triton: 3.3.0
+      RCCL: 2.22.3
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
+        config_name: llama3.3_70B-pretrain.yaml
+      - model: Llama 3.1 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
+        config_name: llama3.1_70B-pretrain.yaml
+      - model: Llama 3.1 8B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
+        config_name: llama3.1_8B-pretrain.yaml
+      - model: Llama 2 7B
+        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
+        config_name: llama2_7B-pretrain.yaml
+      - model: Llama 2 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
+        config_name: llama2_70B-pretrain.yaml
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
+        config_name: deepseek_v3-pretrain.yaml
+      - model: DeepSeek-V2-Lite
+        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
+        config_name: deepseek_v2_lite-pretrain.yaml
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
+        config_name: mixtral_8x7B_v0.1-pretrain.yaml
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
+        config_name: mixtral_8x22B_v0.1-pretrain.yaml
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
+        config_name: primus_qwen2.5_7B-pretrain.yaml
+      - model: Qwen 2.5 72B
+        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
+        config_name: qwen2.5_72B-pretrain.yaml
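The ``mad_tag`` values above plug into the same MAD workflow used by the inference guides in this changeset. A sketch, assuming the ROCm/MAD repository and its requirements are installed on the host and that these training tags are registered in MAD's ``models.json``:

   git clone https://github.com/ROCm/MAD
   cd MAD
   pip install -r requirements.txt
   madengine run --tags primus_pyt_megatron_lm_train_llama-3.1-8b --live-output --timeout 28800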
@@ -2,58 +2,132 @@
 :description: How to install deep learning frameworks for ROCm
 :keywords: deep learning, frameworks, ROCm, install, PyTorch, TensorFlow, JAX, MAGMA, DeepSpeed, ML, AI

-********************************************
-Installing deep learning frameworks for ROCm
-********************************************
+**********************************
+Deep learning frameworks for ROCm
+**********************************

-ROCm provides a comprehensive ecosystem for deep learning development, including
-:ref:`libraries <artificial-intelligence-apis>` for optimized deep learning operations and ROCm-aware versions of popular
-deep learning frameworks and libraries such as PyTorch, TensorFlow, and JAX. ROCm works closely with these
-frameworks to ensure that framework-specific optimizations take advantage of AMD accelerator and GPU architectures.
+Deep learning frameworks provide environments for machine learning, training, fine-tuning, inference, and performance optimization.

-The following guides provide information on compatibility and supported
-features for these ROCm-enabled deep learning frameworks.
+ROCm offers a complete ecosystem for developing and running deep learning applications efficiently. It also provides ROCm-compatible versions of popular frameworks and libraries, such as PyTorch, TensorFlow, JAX, and others.

-* :doc:`PyTorch compatibility <../compatibility/ml-compatibility/pytorch-compatibility>`
-* :doc:`TensorFlow compatibility <../compatibility/ml-compatibility/tensorflow-compatibility>`
-* :doc:`JAX compatibility <../compatibility/ml-compatibility/jax-compatibility>`
-* :doc:`verl compatibility <../compatibility/ml-compatibility/verl-compatibility>`
-* :doc:`Stanford Megatron-LM compatibility <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`
-* :doc:`DGL compatibility <../compatibility/ml-compatibility/dgl-compatibility>`
-* :doc:`Megablocks compatibility <../compatibility/ml-compatibility/megablocks-compatibility>`
-* :doc:`Taichi compatibility <../compatibility/ml-compatibility/taichi-compatibility>`
+The AMD ROCm organization actively contributes to open-source development and collaborates closely with framework organizations. This collaboration ensures that framework-specific optimizations effectively leverage AMD GPUs and accelerators.

-This chart steps through typical installation workflows for installing deep learning frameworks for ROCm.
+The table below summarizes information about ROCm-enabled deep learning frameworks. It includes details on ROCm compatibility and third-party tool support, installation steps and options, and links to GitHub resources. For a complete list of supported framework versions on ROCm, see the :doc:`Compatibility matrix <../compatibility/compatibility-matrix>` topic.

-.. image:: ../data/how-to/framework_install_2024_07_04.png
-   :alt: Flowchart for installing ROCm-aware machine learning frameworks
-   :align: center
-
-See the installation instructions to get started.
-
-* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
-* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
-* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
-* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
-* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
-* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/dgl-install>`
-* :doc:`Megablocks for ROCm <rocm-install-on-linux:install/3rd-party/megablocks-install>`
-* :doc:`Taichi for ROCm <rocm-install-on-linux:install/3rd-party/taichi-install>`
-
-.. note::
-
-   For guidance on installing ROCm itself, refer to :doc:`ROCm installation for Linux <rocm-install-on-linux:index>`.
+.. list-table::
+   :header-rows: 1
+   :widths: 5 3 6 3
+
+   * - Framework
+     - Installation
+     - Installation options
+     - GitHub
+
+   * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`_
+       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`_
+       - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`_
+       - `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`_
+       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`_
+       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>

 Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
 through the following guides.

 * :doc:`rocm-for-ai/index`

-* :doc:`Training <rocm-for-ai/training/index>`
-
-* :doc:`Fine-tuning LLMs <rocm-for-ai/fine-tuning/index>`
-
-* :doc:`Inference <rocm-for-ai/inference/index>`
-
-* :doc:`Inference optimization <rocm-for-ai/inference-optimization/index>`
+* :doc:`Use ROCm for training <rocm-for-ai/training/index>`
+
+* :doc:`Use ROCm for fine-tuning LLMs <rocm-for-ai/fine-tuning/index>`
+
+* :doc:`Use ROCm for AI inference <rocm-for-ai/inference/index>`
+
+* :doc:`Use ROCm for AI inference optimization <rocm-for-ai/inference-optimization/index>`
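As a concrete example of the "Docker image" installation option listed in the new table, the ROCm PyTorch image can be pulled and sanity-checked from Python. A sketch, assuming a configured ROCm host and the ``rocm/pytorch`` image on Docker Hub:

   docker pull rocm/pytorch:latest
   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/pytorch:latest python3 -c "import torch; print(torch.cuda.is_available())"

On ROCm builds of PyTorch, ``torch.cuda.is_available()`` reports whether the GPU is visible inside the container.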
@@ -14,7 +14,7 @@ vLLM inference performance testing
    This documentation does not reflect the latest version of ROCm vLLM
    inference performance documentation. See :doc:`../vllm` for the latest version.

-.. _vllm-benchmark-unified-docker:
+.. _vllm-benchmark-unified-docker-702:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml

@@ -77,7 +77,7 @@ vLLM inference performance testing
       </div>
    </div>

-   .. _vllm-benchmark-vllm:
+   .. _vllm-benchmark-vllm-702:

    {% for model_group in model_groups %}
    {% for model in model_group.models %}
@@ -159,7 +159,7 @@ vLLM inference performance testing
    Once the setup is complete, choose between two options to reproduce the
    benchmark results:

-   .. _vllm-benchmark-mad:
+   .. _vllm-benchmark-mad-702:

    {% for model_group in model_groups %}
    {% for model in model_group.models %}
@@ -0,0 +1,450 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
|
.. meta::
|
||||||
|
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
|
||||||
|
ROCm vLLM Docker image.
|
||||||
|
:keywords: model, MAD, automation, dashboarding, validate
|
||||||
|
|
||||||
|
**********************************
|
||||||
|
vLLM inference performance testing
|
||||||
|
**********************************
|
||||||
|
|
||||||
|
.. caution::
|
||||||
|
|
||||||
|
This documentation does not reflect the latest version of ROCm vLLM
|
||||||
|
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||||
|
|
||||||
|
.. _vllm-benchmark-unified-docker-715:
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
|
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
|
||||||
|
a prebuilt, optimized environment for validating large language model (LLM)
|
||||||
|
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
|
||||||
|
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
|
||||||
|
accelerators and includes the following components:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Software component
|
||||||
|
- Version
|
||||||
|
|
||||||
|
* - `ROCm <https://github.com/ROCm/ROCm>`__
|
||||||
|
- {{ unified_docker.rocm_version }}
|
||||||
|
|
||||||
|
* - `vLLM <https://docs.vllm.ai/en/latest>`__
|
||||||
|
- {{ unified_docker.vllm_version }}
|
||||||
|
|
||||||
|
* - `PyTorch <https://github.com/ROCm/pytorch>`__
|
||||||
|
- {{ unified_docker.pytorch_version }}
|
||||||
|
|
||||||
|
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
||||||
|
- {{ unified_docker.hipblaslt_version }}
|
||||||
|
|
||||||
|
With this Docker image, you can quickly test the :ref:`expected
|
||||||
|
inference performance numbers <vllm-benchmark-performance-measurements>` for
|
||||||
|
MI300X series accelerators.
|
||||||
|
|
||||||
|
What's new
|
||||||
|
==========
|
||||||
|
|
||||||
|
The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
|
||||||
|
|
||||||
|
* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default.
|
||||||
|
This parameter has been removed from the benchmarking script.
|
||||||
|
|
||||||
|
* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
|
||||||
|
This parameter has been removed from the benchmarking script.
|
||||||
|
|
||||||
|
* Fixed a ``+rms_norm`` custom kernel issue.
|
||||||
|
|
||||||
|
* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``.
|
||||||
|
|
||||||
|
* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
|
||||||
|
|
||||||
|
Supported models
|
||||||
|
================
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
|
.. _vllm-benchmark-available-models-715:
|
||||||
|
|
||||||
|
The following models are supported for inference performance benchmarking
|
||||||
|
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||||
|
documentation might vary by model -- select one to get started.
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model group</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row mt-1">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
.. _vllm-benchmark-vllm-715:
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{model.mad_tag}}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||||
|
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
||||||
|
high-performance custom kernels and modules in vLLM to enhance performance.
|
||||||
|
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||||
|
more information.
|
||||||
|
|
||||||
|
.. _vllm-benchmark-performance-measurements-715:
|
||||||
|
|
||||||
|
Performance measurements
|
||||||
|
========================
|
||||||
|
|
||||||
|
To evaluate performance, the
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
|
page provides reference throughput and latency measurements for inferencing popular AI models.
|
||||||
|
|
||||||
|
.. important::
|
||||||
|
|
||||||
|
The performance data presented in
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
|
only reflects the latest version of this inference benchmarking environment.
|
||||||
|
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
||||||
|
|
||||||
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
|
correctly and performing optimally.
|
||||||
|
|
||||||
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before starting training.
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
|
Pull the Docker image
|
||||||
|
=====================
|
||||||
|
|
||||||
|
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||||
|
Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
|
Benchmarking
|
||||||
|
============
|
||||||
|
|
||||||
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
|
benchmark results:
|
||||||
|
|
||||||
|
.. _vllm-benchmark-mad-715:
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{model.mad_tag}}
|
||||||
|
|
||||||
|
.. tab-set::
|
||||||
|
|
||||||
|
.. tab-item:: MAD-integrated benchmarking
|
||||||
|
|
||||||
|
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||||
|
directory and install the required packages on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
|
||||||
|
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||||
|
madengine run \
|
||||||
|
--tags {{model.mad_tag}} \
|
||||||
|
--keep-model-dir \
|
||||||
|
--live-output \
|
||||||
|
--timeout 28800
|
||||||
|
|
||||||
|
MAD launches a Docker container with the name
|
||||||
|
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
||||||
|
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
|
||||||
|
|
||||||
|
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
|
||||||
|
to collect latency and throughput performance data, you can also change the benchmarking
|
||||||
|
parameters. See the standalone benchmarking tab for more information.
|
||||||
|
|
||||||
|
{% if model.tunableop %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
|
||||||
|
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||||
|
operators to find the fastest one for your hardware.
|
||||||
|
|
||||||
|
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
|
||||||
|
(see
|
||||||
|
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
|
||||||
|
To enable it, include the ``--tunableop on`` argument in your
|
||||||
|
run.
|
||||||
|
|
||||||
|
Enabling TunableOp triggers a two-pass run -- a warm-up followed
|
||||||
|
by the performance-collection run.
|
||||||
|
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
.. tab-item:: Standalone benchmarking
|
||||||
|
|
||||||
|
.. rubric:: Download the Docker image and required scripts
|
||||||
|
|
||||||
|
1. Run the vLLM benchmark tool independently by starting the
|
||||||
|
`Docker container <{{ unified_docker.docker_hub_url }}>`_
|
||||||
|
as shown in the following snippet.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
docker run -it \
|
||||||
|
--device=/dev/kfd \
|
||||||
|
--device=/dev/dri \
|
||||||
|
--group-add video \
|
||||||
|
--shm-size 16G \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--security-opt apparmor=unconfined \
|
||||||
|
--cap-add=SYS_PTRACE \
|
||||||
|
-v $(pwd):/workspace \
|
||||||
|
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||||
|
--name test \
|
||||||
|
{{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
|
2. In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||||
|
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD/scripts/vllm
|
||||||
|
|
||||||
|
3. To start the benchmark, use the following command with the appropriate options.
|
||||||
|
|
||||||
|
.. dropdown:: Benchmark options
|
||||||
|
:open:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
:align: center
|
||||||
|
|
||||||
|
* - Name
|
||||||
|
- Options
|
||||||
|
- Description
|
||||||
|
|
||||||
|
* - ``$test_option``
|
||||||
|
- latency
|
||||||
|
- Measure decoding token latency
|
||||||
|
|
||||||
|
* -
|
||||||
|
- throughput
|
||||||
|
- Measure token generation throughput
|
||||||
|
|
||||||
|
* -
|
||||||
|
- all
|
||||||
|
- Measure both throughput and latency
|
||||||
|
|
||||||
|
* - ``$num_gpu``
|
||||||
|
- 1 or 8
|
||||||
|
- Number of GPUs
|
||||||
|
|
||||||
|
* - ``$datatype``
|
||||||
|
- ``float16`` or ``float8``
|
||||||
|
- Data type
|
||||||
|
|
||||||
|
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||||
|
already configured. You don't need to specify them with this script.
|
||||||
|
|
||||||
|
Command:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh \
|
||||||
|
-s $test_option \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g $num_gpu \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` (see the example at the end of this note).
|
||||||
|
|
||||||
|
If you encounter the following error, pass your access-authorized Hugging
|
||||||
|
Face token to the gated models.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
OSError: You are trying to access a gated repo.
|
||||||
|
|
||||||
|
# pass your HF_TOKEN
|
||||||
|
export HF_TOKEN=$your_personal_hf_token
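For example, here is a minimal sketch that combines the recommendation above with the benchmark invocation used on this page:

.. code-block:: shell

   # Recommended attention path for vLLM V1 (see the recommendation above)
   export VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
   ./vllm_benchmark_report.sh \
       -s all \
       -m {{model.model_repo}} \
       -g 8 \
       -d {{model.precision}}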
|
||||||
|
|
||||||
|
.. rubric:: Benchmarking examples
|
||||||
|
|
||||||
|
Here are some examples of running the benchmark with various options:
|
||||||
|
|
||||||
|
* Latency benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh \
|
||||||
|
-s latency \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g 8 \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
|
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
|
||||||
|
|
||||||
|
* Throughput benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh \
|
||||||
|
-s throughput \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g 8 \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
|
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<style>
|
||||||
|
mjx-container[jax="CHTML"][display="true"] {
|
||||||
|
text-align: left;
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Throughput is calculated as:
|
||||||
|
|
||||||
|
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||||
|
|
||||||
|
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
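As a purely illustrative example with hypothetical numbers (not measured results), 100 requests with 128 input tokens and 128 output tokens each, completing in 10 seconds, would give:

- .. math:: throughput\_tot = 100 \times (128 + 128) / 10 = 2560 \ \mathsf{\text{tokens/s}}

- .. math:: throughput\_gen = 100 \times 128 / 10 = 1280 \ \mathsf{\text{tokens/s}}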
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
Advanced usage
|
||||||
|
==============
|
||||||
|
|
||||||
|
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||||
|
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
|
||||||
|
|
||||||
|
Reproducing the Docker image
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
||||||
|
|
||||||
|
1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/vllm.git
|
||||||
|
|
||||||
|
2. Check out the specific release commit.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd vllm
|
||||||
|
git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
|
||||||
|
|
||||||
|
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
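After the build completes, you can confirm the image exists locally (a quick, optional check; ``vllm-rocm`` matches the tag used in the build command above):

.. code-block:: shell

   docker images vllm-rocm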
|
||||||
|
|
||||||
|
Known issues and workarounds
|
||||||
|
============================
|
||||||
|
|
||||||
|
AITER does not support FP8 KV cache yet.
|
||||||
|
|
||||||
|
Further reading
|
||||||
|
===============
|
||||||
|
|
||||||
|
- To learn more about the options for latency and throughput benchmark scripts,
|
||||||
|
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||||
|
|
||||||
|
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||||
|
|
||||||
|
- To learn more about system settings and management practices to configure your system for
|
||||||
|
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||||
|
|
||||||
|
- For application performance optimization strategies for HPC and AI workloads,
|
||||||
|
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||||
|
|
||||||
|
- To learn how to run community models from Hugging Face on AMD GPUs, see
|
||||||
|
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
|
||||||
|
|
||||||
|
- To learn how to fine-tune LLMs and optimize inference, see
|
||||||
|
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
|
||||||
|
|
||||||
|
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||||
|
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||||
|
|
||||||
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`vllm-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/vllm`` Docker image.
|
||||||
@@ -16,14 +16,23 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
|
|||||||
- Components
|
- Components
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812``
|
||||||
(latest)
|
(latest)
|
||||||
|
-
|
||||||
|
* ROCm 6.4.1
|
||||||
|
* vLLM 0.10.0
|
||||||
|
* PyTorch 2.7.0
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <../vllm>`
|
||||||
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa>`__
|
||||||
|
|
||||||
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
|
||||||
-
|
-
|
||||||
* ROCm 6.4.1
|
* ROCm 6.4.1
|
||||||
* vLLM 0.9.1
|
* vLLM 0.9.1
|
||||||
* PyTorch 2.7.0
|
* PyTorch 2.7.0
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../vllm>`
|
* :doc:`Documentation <vllm-0.9.1-20250715>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__
|
||||||
|
|
||||||
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ PyTorch inference performance testing
|
|||||||
|
|
||||||
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
|
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
|
||||||
|
|
||||||
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference pyt_janus_pro_inference
|
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference pyt_janus_pro_inference pyt_hy_video
|
||||||
|
|
||||||
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`__ from Docker Hub.
|
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`__ from Docker Hub.
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
vLLM inference performance testing
|
vLLM inference performance testing
|
||||||
**********************************
|
**********************************
|
||||||
|
|
||||||
.. _vllm-benchmark-unified-docker:
|
.. _vllm-benchmark-unified-docker-812:
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||||
|
|
||||||
@@ -47,17 +47,11 @@ What's new
|
|||||||
|
|
||||||
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
||||||
|
|
||||||
* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default.
|
* Upgraded to vLLM v0.10.
|
||||||
This parameter has been removed from the benchmarking script.
|
|
||||||
|
|
||||||
* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
|
* FP8 KV cache support via AITER.
|
||||||
This parameter has been removed from the benchmarking script.
|
|
||||||
|
|
||||||
* Fixed a ``+rms_norm`` custom kernel issue.
|
* Full graph capture support via AITER.
|
||||||
|
|
||||||
* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``.
|
|
||||||
|
|
||||||
* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
|
|
||||||
|
|
||||||
Supported models
|
Supported models
|
||||||
================
|
================
|
||||||
@@ -67,7 +61,7 @@ Supported models
|
|||||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
.. _vllm-benchmark-available-models:
|
.. _vllm-benchmark-available-models-812:
|
||||||
|
|
||||||
The following models are supported for inference performance benchmarking
|
The following models are supported for inference performance benchmarking
|
||||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||||
@@ -102,7 +96,7 @@ Supported models
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
.. _vllm-benchmark-vllm:
|
.. _vllm-benchmark-vllm-812:
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
@@ -124,14 +118,14 @@ Supported models
|
|||||||
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||||
more information.
|
more information.
|
||||||
|
|
||||||
.. _vllm-benchmark-performance-measurements:
|
.. _vllm-benchmark-performance-measurements-812:
|
||||||
|
|
||||||
Performance measurements
|
Performance measurements
|
||||||
========================
|
========================
|
||||||
|
|
||||||
To evaluate performance, the
|
To evaluate performance, the
|
||||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
page provides reference throughput and latency measurements for inferencing popular AI models.
|
page provides reference throughput and serving measurements for inferencing popular AI models.
|
||||||
|
|
||||||
.. important::
|
.. important::
|
||||||
|
|
||||||
@@ -176,7 +170,7 @@ system's configuration.
|
|||||||
Once the setup is complete, choose between two options to reproduce the
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
benchmark results:
|
benchmark results:
|
||||||
|
|
||||||
.. _vllm-benchmark-mad:
|
.. _vllm-benchmark-mad-812:
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
@@ -209,12 +203,15 @@ system's configuration.
|
|||||||
--timeout 28800
|
--timeout 28800
|
||||||
|
|
||||||
MAD launches a Docker container with the name
|
MAD launches a Docker container with the name
|
||||||
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||||
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
|
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||||
|
and ``{{ model.mad_tag }}_serving.csv``.
|
||||||
|
|
||||||
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
|
Although the :ref:`available models
|
||||||
to collect latency and throughput performance data, you can also change the benchmarking
|
<vllm-benchmark-available-models>` are preconfigured to collect
|
||||||
parameters. See the standalone benchmarking tab for more information.
|
offline throughput and online serving performance data, you can
|
||||||
|
also change the benchmarking parameters. See the standalone
|
||||||
|
benchmarking tab for more information.
|
||||||
|
|
||||||
{% if model.tunableop %}
|
{% if model.tunableop %}
|
||||||
|
|
||||||
@@ -224,14 +221,12 @@ system's configuration.
|
|||||||
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||||
operators to find the fastest one for your hardware.
|
operators to find the fastest one for your hardware.
|
||||||
|
|
||||||
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
|
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
|
||||||
(see
|
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
|
||||||
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
|
the ``--tunableop on`` argument in your run.
|
||||||
To enable it, include the ``--tunableop on`` argument in your
|
|
||||||
run.
|
|
||||||
|
|
||||||
Enabling TunableOp triggers a two-pass run -- a warm-up followed
|
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
|
||||||
by the performance-collection run.
|
performance-collection run.
|
||||||
|
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
@@ -269,6 +264,13 @@ system's configuration.
|
|||||||
|
|
||||||
3. To start the benchmark, use the following command with the appropriate options.
|
3. To start the benchmark, use the following command with the appropriate options.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./run.sh \
|
||||||
|
--config $CONFIG_CSV \
|
||||||
|
--model_repo {{ model.model_repo }} \
|
||||||
|
<overrides>
|
||||||
|
|
||||||
.. dropdown:: Benchmark options
|
.. dropdown:: Benchmark options
|
||||||
:open:
|
:open:
|
||||||
|
|
||||||
@@ -280,42 +282,40 @@ system's configuration.
|
|||||||
- Options
|
- Options
|
||||||
- Description
|
- Description
|
||||||
|
|
||||||
* - ``$test_option``
|
* - ``--config``
|
||||||
- latency
|
- ``configs/default.csv``
|
||||||
- Measure decoding token latency
|
- Run configs from the CSV for the chosen model repo and benchmark.
|
||||||
|
|
||||||
* -
|
* -
|
||||||
- throughput
|
- ``configs/extended.csv``
|
||||||
- Measure token generation throughput
|
-
|
||||||
|
|
||||||
* -
|
* -
|
||||||
- all
|
- ``configs/performance.csv``
|
||||||
- Measure both throughput and latency
|
-
|
||||||
|
|
||||||
* - ``$num_gpu``
|
* - ``--benchmark``
|
||||||
- 1 or 8
|
- ``throughput``
|
||||||
- Number of GPUs
|
- Measure offline end-to-end throughput.
|
||||||
|
|
||||||
* - ``$datatype``
|
* -
|
||||||
- ``float16`` or ``float8``
|
- ``serving``
|
||||||
- Data type
|
- Measure online serving performance.
|
||||||
|
|
||||||
|
* -
|
||||||
|
- ``all``
|
||||||
|
- Measure both throughput and serving.
|
||||||
|
|
||||||
|
* - ``<overrides>``
|
||||||
|
- See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
|
||||||
|
- Additional overrides to the config CSV.
|
||||||
|
|
||||||
The input sequence length, output sequence length, and tensor parallel (TP) are
|
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||||
already configured. You don't need to specify them with this script.
|
already configured. You don't need to specify them with this script.
|
||||||
|
|
||||||
Command:
|
|
||||||
|
|
||||||
.. code-block::
|
|
||||||
|
|
||||||
./vllm_benchmark_report.sh \
|
|
||||||
-s $test_option \
|
|
||||||
-m {{model.model_repo}} \
|
|
||||||
-g $num_gpu \
|
|
||||||
-d {{model.precision}}
|
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
For best performance, it's recommend to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
|
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
|
||||||
|
|
||||||
If you encounter the following error, pass your access-authorized Hugging
|
If you encounter the following error, pass your access-authorized Hugging
|
||||||
Face token to the gated models.
|
Face token to the gated models.
|
||||||
@@ -331,33 +331,33 @@ system's configuration.
|
|||||||
|
|
||||||
Here are some examples of running the benchmark with various options:
|
Here are some examples of running the benchmark with various options:
|
||||||
|
|
||||||
* Latency benchmark
|
|
||||||
|
|
||||||
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
|
||||||
|
|
||||||
.. code-block::
|
|
||||||
|
|
||||||
./vllm_benchmark_report.sh \
|
|
||||||
-s latency \
|
|
||||||
-m {{model.model_repo}} \
|
|
||||||
-g 8 \
|
|
||||||
-d {{model.precision}}
|
|
||||||
|
|
||||||
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
|
|
||||||
|
|
||||||
* Throughput benchmark
|
* Throughput benchmark
|
||||||
|
|
||||||
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
./vllm_benchmark_report.sh \
|
export MAD_MODEL_NAME={{ model.mad_tag }}
|
||||||
-s throughput \
|
./run.sh \
|
||||||
-m {{model.model_repo}} \
|
--config configs/default.csv \
|
||||||
-g 8 \
|
--model_repo {{model.model_repo}} \
|
||||||
-d {{model.precision}}
|
--benchmark throughput
|
||||||
|
|
||||||
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
|
Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
|
||||||
|
|
||||||
|
* Serving benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
export MAD_MODEL_NAME={{ model.mad_tag }}
|
||||||
|
./run.sh \
|
||||||
|
--config configs/default.csv \
|
||||||
|
--model_repo {{model.model_repo}} \
|
||||||
|
--benchmark serving
|
||||||
|
|
||||||
|
Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
|
||||||
|
|
||||||
.. raw:: html
|
.. raw:: html
|
||||||
|
|
||||||
@@ -400,7 +400,7 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
|||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
cd vllm
|
cd vllm
|
||||||
git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
|
git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
|
||||||
|
|
||||||
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
||||||
|
|
||||||
@@ -408,11 +408,6 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
|||||||
|
|
||||||
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
|
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
|
||||||
|
|
||||||
Known issues and workarounds
|
|
||||||
============================
|
|
||||||
|
|
||||||
AITER does not support FP8 KV cache yet.
|
|
||||||
|
|
||||||
Further reading
|
Further reading
|
||||||
===============
|
===============
|
||||||
|
|
||||||
|
|||||||
@@ -1,14 +1,14 @@
|
|||||||
.. meta::
|
.. meta::
|
||||||
:description: How to install ROCm and popular machine learning frameworks.
|
:description: How to install ROCm and popular deep learning frameworks.
|
||||||
:keywords: ROCm, AI, LLM, train, fine-tune, FSDP, DeepSpeed, LLaMA, tutorial
|
:keywords: ROCm, AI, LLM, train, fine-tune, FSDP, DeepSpeed, LLaMA, tutorial
|
||||||
|
|
||||||
.. _rocm-for-ai-install:
|
.. _rocm-for-ai-install:
|
||||||
|
|
||||||
***********************************************
|
********************************************
|
||||||
Installing ROCm and machine learning frameworks
|
Installing ROCm and deep learning frameworks
|
||||||
***********************************************
|
********************************************
|
||||||
|
|
||||||
Before getting started, install ROCm and supported machine learning frameworks.
|
Before getting started, install ROCm and supported deep learning frameworks.
|
||||||
|
|
||||||
.. grid:: 1
|
.. grid:: 1
|
||||||
|
|
||||||
@@ -43,29 +43,16 @@ distribution's package manager. See the following documentation resources to get
|
|||||||
If you encounter any issues during installation, refer to the
|
If you encounter any issues during installation, refer to the
|
||||||
:doc:`Installation troubleshooting <rocm-install-on-linux:reference/install-faq>` guide.
|
:doc:`Installation troubleshooting <rocm-install-on-linux:reference/install-faq>` guide.
|
||||||
|
|
||||||
Machine learning frameworks
|
Deep learning frameworks
|
||||||
===========================
|
========================
|
||||||
|
|
||||||
ROCm supports popular machine learning frameworks and libraries including `PyTorch
|
ROCm supports deep learning frameworks and libraries including `PyTorch
|
||||||
<https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package>`_, `TensorFlow
|
<https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package>`_, `TensorFlow
|
||||||
<https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and `DeepSpeed
|
<https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and more.
|
||||||
<https://cloudblogs.microsoft.com/opensource/2022/03/21/supporting-efficient-large-model-training-on-amd-instinct-gpus-with-deepspeed/>`_.
|
|
||||||
|
|
||||||
Review the framework installation documentation. For ease-of-use, it's recommended to use official ROCm prebuilt Docker
|
Review the :doc:`framework installation documentation <../deep-learning-rocm>`. For ease-of-use, it's recommended to use official ROCm prebuilt Docker
|
||||||
images with the framework pre-installed.
|
images with the framework pre-installed.
|
||||||
|
|
||||||
* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
|
||||||
|
|
||||||
* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
|
||||||
|
|
||||||
* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
|
||||||
|
|
||||||
* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
|
|
||||||
|
|
||||||
* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
|
||||||
|
|
||||||
* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
|
||||||
|
|
||||||
Next steps
|
Next steps
|
||||||
==========
|
==========
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
.. meta::
|
.. meta::
|
||||||
:description: How to train a model using Megatron-LM for ROCm.
|
:description: How to train a model using Megatron-LM for ROCm.
|
||||||
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||||
@@ -6,6 +8,14 @@
|
|||||||
Training a model with Megatron-LM for ROCm
|
Training a model with Megatron-LM for ROCm
|
||||||
******************************************
|
******************************************
|
||||||
|
|
||||||
|
.. caution::
|
||||||
|
|
||||||
|
The ROCm Megatron-LM framework now has limited support in this Docker
|
||||||
|
environment, which now focuses on Primus with Megatron-Core. See :doc:`primus-megatron`.
|
||||||
|
|
||||||
|
To learn how to migrate your existing workloads to Primus with Megatron-Core,
|
||||||
|
see :doc:`previous-versions/megatron-lm-primus-migration-guide`.
|
||||||
|
|
||||||
The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
|
The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
|
||||||
a specialized fork of the robust Megatron-LM, designed to enable efficient
|
a specialized fork of the robust Megatron-LM, designed to enable efficient
|
||||||
training of large-scale language models on AMD GPUs. By leveraging AMD
|
training of large-scale language models on AMD GPUs. By leveraging AMD
|
||||||
@@ -20,13 +30,17 @@ essential components, including PyTorch, ROCm libraries, and Megatron-LM
|
|||||||
utilities. It contains the following software components to accelerate training
|
utilities. It contains the following software components to accelerate training
|
||||||
workloads:
|
workloads:
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
|
||||||
|
Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
||||||
|
|
||||||
{% set dockers = data.dockers %}
|
{% set dockers = data.dockers %}
|
||||||
{% if dockers|length > 1 %}
|
|
||||||
.. tab-set::
|
.. tab-set::
|
||||||
|
|
||||||
{% for docker in data.dockers %}
|
{% for docker in dockers %}
|
||||||
.. tab-item:: ``{{ docker.pull_tag }}``
|
.. tab-item:: ``{{ docker.pull_tag }}``
|
||||||
:sync: {{ docker.pull_tag }}
|
:sync: {{ docker.pull_tag }}
|
||||||
|
|
||||||
@@ -42,28 +56,14 @@ workloads:
|
|||||||
|
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% elif dockers|length == 1 %}
|
|
||||||
.. list-table::
|
|
||||||
:header-rows: 1
|
|
||||||
|
|
||||||
* - Software component
|
|
||||||
- Version
|
|
||||||
|
|
||||||
{% for component_name, component_version in docker.components %}
|
|
||||||
* - {{ component_name }}
|
|
||||||
- {{ component_version }}
|
|
||||||
|
|
||||||
{% endfor %}
|
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
.. _amd-megatron-lm-model-support:
|
.. _amd-megatron-lm-model-support:
|
||||||
|
|
||||||
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
|
||||||
|
|
||||||
Supported models
|
Supported models
|
||||||
================
|
================
|
||||||
|
|
||||||
The following models are supported for training performance benchmarking with Megatron-LM and ROCm.
|
The following models are supported for training performance benchmarking with Megatron-LM and ROCm
|
||||||
|
on AMD Instinct MI300X series accelerators.
|
||||||
Some instructions, commands, and training recommendations in this documentation might
|
Some instructions, commands, and training recommendations in this documentation might
|
||||||
vary by model -- select one to get started.
|
vary by model -- select one to get started.
|
||||||
|
|
||||||
@@ -177,7 +177,7 @@ Download the Docker image
|
|||||||
{% if dockers|length > 1 %}
|
{% if dockers|length > 1 %}
|
||||||
.. tab-set::
|
.. tab-set::
|
||||||
|
|
||||||
{% for docker in data.dockers %}
|
{% for docker in dockers %}
|
||||||
.. tab-item:: {{ docker.doc_name }}
|
.. tab-item:: {{ docker.doc_name }}
|
||||||
:sync: {{ docker.pull_tag }}
|
:sync: {{ docker.pull_tag }}
|
||||||
|
|
||||||
@@ -227,10 +227,17 @@ Download the Docker image
|
|||||||
docker start megatron_training_env
|
docker start megatron_training_env
|
||||||
docker exec -it megatron_training_env bash
|
docker exec -it megatron_training_env bash
|
||||||
|
|
||||||
The Docker container includes a pre-installed, verified version of the ROCm
|
4. **Megatron-LM backward compatibility setup** -- this Docker image is primarily intended for use with Primus, but it maintains limited backward compatibility with Megatron-LM.
|
||||||
Megatron-LM development branch
|
To roll back to using Megatron-LM, follow these steps:
|
||||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__, including necessary
|
|
||||||
training scripts.
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd /workspace/Megatron-LM/
|
||||||
|
pip uninstall megatron-core
|
||||||
|
pip install -e .
|
||||||
|
|
||||||
|
The Docker container hosts
|
||||||
|
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__ at verified commit ``e8e9edc``.
|
||||||
|
|
||||||
.. _amd-megatron-lm-environment-setup:
|
.. _amd-megatron-lm-environment-setup:
|
||||||
|
|
||||||
|
|||||||
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
|
|||||||
- Components
|
- Components
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
* - v25.6 (latest)
|
* - v25.7 (latest)
|
||||||
|
-
|
||||||
|
* ROCm
|
||||||
|
* PyTorch
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <../megatron-lm>`
|
||||||
|
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a>`__
|
||||||
|
|
||||||
|
* - v25.6
|
||||||
-
|
-
|
||||||
* ROCm 6.4.1
|
* ROCm 6.4.1
|
||||||
* PyTorch 2.8.0a0+git7d205b2
|
* PyTorch 2.8.0a0+git7d205b2
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../megatron-lm>`
|
* :doc:`Documentation <megatron-lm-v25.6>`
|
||||||
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
|
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
|
||||||
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__
|
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,175 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
|
**********************************************************************
|
||||||
|
Migrating workloads to Primus (Megatron-Core backend) from Megatron-LM
|
||||||
|
**********************************************************************
|
||||||
|
|
||||||
|
Primus supports Megatron-Core as a backend optimization library,
|
||||||
|
replacing ROCm Megatron-LM. This document outlines the steps to migrate
|
||||||
|
workloads from ROCm Megatron-LM to Primus with the Megatron-Core backend.
|
||||||
|
|
||||||
|
Model architecture
|
||||||
|
==================
|
||||||
|
|
||||||
|
ROCm Megatron-LM defines model architecture parameters in the training scripts;
|
||||||
|
for example, the Llama 3 8B model parameters are defined in
|
||||||
|
`examples/llama/train_llama3.sh <https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama3.sh#L117>`__
|
||||||
|
as shown below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
HIDDEN_SIZE=4096
|
||||||
|
FFN_HIDDEN_SIZE=14336
|
||||||
|
NUM_LAYERS=32
|
||||||
|
NUM_HEADS=32
|
||||||
|
NUM_KV_HEADS=8
|
||||||
|
|
||||||
|
Primus defines the model architecture through model YAML configuration files
|
||||||
|
inside the ``primus/configs/models/megatron/`` directory of the repository. For example, the Llama 3 8B
|
||||||
|
model architecture parameters are defined in
|
||||||
|
`primus/configs/models/megatron/llama3_8B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/llama3_8B.yaml>`__
|
||||||
|
as shown below:
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
bases:
|
||||||
|
- llama3_base.yaml
|
||||||
|
|
||||||
|
tokenizer_type: Llama3Tokenizer
|
||||||
|
tokenizer_model: meta-llama/Llama-3.1-8B
|
||||||
|
|
||||||
|
ffn_hidden_size: 14336
|
||||||
|
hidden_size: 4096
|
||||||
|
num_attention_heads: 32
|
||||||
|
num_layers: 32
|
||||||
|
num_query_groups: 8
|
||||||
|
|
||||||
|
Primus' model config files follow a hierarchical design, meaning that new model
|
||||||
|
config YAMLs can inherit existing model config files by importing them as
|
||||||
|
bases. For example,
|
||||||
|
`llama3.1_8B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
|
||||||
|
uses ``llama3_8B.yaml`` as a base config and overrides few parameters, as shown below.
|
||||||
|
In this example, ``llama3.1_8B`` overrides the ``max_position_embeddings`` value:
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
bases:
|
||||||
|
- llama3_8B.yaml
|
||||||
|
|
||||||
|
tokenizer_type: Llama3Tokenizer
|
||||||
|
tokenizer_model: meta-llama/Llama-3.1-8B
|
||||||
|
|
||||||
|
max_position_embeddings: 131072
|
||||||
|
|
||||||
|
.. tip::
|
||||||
|
|
||||||
|
Primus provides ``llama_base.yaml`` as the base configuration, which can be
|
||||||
|
used as a base for additional model architectures. For example,
|
||||||
|
`mixtral_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/mixtral_base.yaml>`__
|
||||||
|
and
|
||||||
|
`deepseek_v3_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/deepseek_v3_base.yaml>`__
|
||||||
|
define ``llama_base.yaml`` as their base.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
# Example mixtral_base.yaml:
|
||||||
|
|
||||||
|
bases:
|
||||||
|
- llama_base.yaml
|
||||||
|
|
||||||
|
init_method_std: 0.01
|
||||||
|
rotary_base: 1000000
|
||||||
|
qk_layernorm: false
|
||||||
|
|
||||||
|
group_query_attention: true
|
||||||
|
num_query_groups: 8
|
||||||
|
|
||||||
|
# moe parameters
|
||||||
|
num_experts: 8
|
||||||
|
moe_router_topk: 2
|
||||||
|
moe_router_load_balancing_type: aux_loss
|
||||||
|
moe_aux_loss_coeff: 1e-2
|
||||||
|
moe_grouped_gemm: true
|
||||||
|
moe_token_dispatcher_type: alltoall
|
||||||
|
|
||||||
|
It is recommended to add a new ``${MODEL_NAME}_base.yaml`` for each new
|
||||||
|
category of model and define new models on top of it. For example, to add
|
||||||
|
Qwen2.5 models in Primus, we define
|
||||||
|
`qwen2.5_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_base.yaml>`__
|
||||||
|
and build
|
||||||
|
`qwen2.5_7B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_7B.yaml>`__
|
||||||
|
and
|
||||||
|
`qwen2.5_72B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_72B.yaml>`__
|
||||||
|
using ``qwen2.5_base.yaml`` as the base config.
|
||||||
|
|
||||||
|
Training parameters
|
||||||
|
===================
|
||||||
|
|
||||||
|
ROCm Megatron-LM also defines the training parameters, like batch size,
|
||||||
|
tensor parallelism, precision, and so on, in the training scripts. For example,
|
||||||
|
the Llama 3 8B training parameters are defined in
|
||||||
|
`examples/llama/train_llama3.sh <https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama3.sh>`__
|
||||||
|
as shown below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
TP="${TP:-8}"
|
||||||
|
PP="${PP:-1}"
|
||||||
|
CP="${CP:-1}"
|
||||||
|
MBS="${MBS:-1}"
|
||||||
|
BS="${BS:-8}"
|
||||||
|
|
||||||
|
Primus defines the training parameters in top-level YAML files -- see
|
||||||
|
`examples/megatron/configs/
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
|
||||||
|
For example, the `llama3.1_8B-pretrain.yaml
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
|
||||||
|
configuration imports the ``llama3.1_8B.yaml`` model architecture file. Users can then override
|
||||||
|
the default training parameters in ``llama3.1_8B-pretrain.yaml``.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
# model to run
|
||||||
|
model: llama3.1_8B.yaml # Model architecture yaml
|
||||||
|
overrides:
|
||||||
|
# log
|
||||||
|
# disable_wandb: false
|
||||||
|
# disable_tensorboard: false
|
||||||
|
stderr_sink_level: DEBUG
|
||||||
|
|
||||||
|
log_avg_skip_iterations: 2
|
||||||
|
log_avg_reset_interval: 50
|
||||||
|
|
||||||
|
train_iters: 50
|
||||||
|
micro_batch_size: 2
|
||||||
|
global_batch_size: 128
|
||||||
|
|
||||||
|
seq_length: 8192
|
||||||
|
max_position_embeddings: 8192
|
||||||
|
|
||||||
|
lr: 1.0e-5
|
||||||
|
min_lr: 0.0
|
||||||
|
lr_warmup_iters: 2
|
||||||
|
lr_decay_iters: null
|
||||||
|
lr_decay_style: cosine
|
||||||
|
weight_decay: 0.1
|
||||||
|
adam_beta1: 0.9
|
||||||
|
adam_beta2: 0.95
|
||||||
|
eod_mask_loss: true
|
||||||
|
init_method_std: 0.008
|
||||||
|
norm_epsilon: 1.0e-6
|
||||||
|
|
||||||
|
Backward compatibility with Megatron-LM
|
||||||
|
=======================================
|
||||||
|
|
||||||
|
The Dockerized environment used for Primus maintains limited backward compatibility with
|
||||||
|
Megatron-LM. To roll back to using Megatron-LM, follow these steps.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd /workspace/Megatron-LM/
|
||||||
|
pip uninstall megatron-core
|
||||||
|
pip install -e .
|
||||||
|
|
||||||
|
Once Megatron-LM is installed, follow :doc:`the documentation <../megatron-lm>` to run workloads as
|
||||||
|
usual.
|
||||||
File diff suppressed because it is too large
@@ -0,0 +1,602 @@
|
|||||||
|
.. meta::
|
||||||
|
:description: How to train a model using Megatron-LM for ROCm.
|
||||||
|
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||||
|
|
||||||
|
**********************************************
|
||||||
|
Training a model with Primus and Megatron-Core
|
||||||
|
**********************************************
|
||||||
|
|
||||||
|
`Primus <https://github.com/AMD-AIG-AIMA/Primus>`__ is a unified and flexible
|
||||||
|
LLM training framework designed to streamline LLM
|
||||||
|
training on AMD Instinct accelerators using a modular, reproducible configuration paradigm.
|
||||||
|
Primus is backend-agnostic and supports multiple training engines -- including Megatron-Core.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Primus with the Megatron-Core backend is intended to replace ROCm
|
||||||
|
Megatron-LM in this Dockerized training environment. To learn how to migrate
|
||||||
|
workloads from Megatron-LM to Primus with Megatron-Core, see
|
||||||
|
:doc:`previous-versions/megatron-lm-primus-migration-guide`.
|
||||||
|
|
||||||
|
For ease of use, AMD provides a ready-to-use Docker image for MI300 series accelerators
|
||||||
|
containing essential components for Primus and Megatron-Core.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
|
||||||
|
Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set dockers = data.dockers %}
|
||||||
|
{% set docker = dockers[0] %}
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Software component
|
||||||
|
- Version
|
||||||
|
|
||||||
|
{% for component_name, component_version in docker.components.items() %}
|
||||||
|
* - {{ component_name }}
|
||||||
|
- {{ component_version }}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-model-support:
|
||||||
|
|
||||||
|
Supported models
|
||||||
|
================
|
||||||
|
|
||||||
|
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
||||||
|
Some instructions, commands, and training examples in this documentation might
|
||||||
|
vary by model -- select one to get started.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set model_groups = data.model_groups %}
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row mt-1">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model variant</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Some models, such as Llama, require an external license agreement through
|
||||||
|
a third party (for example, Meta).
|
||||||
|
|
||||||
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
|
correctly and performing optimally.
|
||||||
|
|
||||||
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before starting training.
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
|
.. _mi300x-amd-primus-megatron-lm-training:
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set dockers = data.dockers %}
|
||||||
|
{% set docker = dockers[0] %}
|
||||||
|
|
||||||
|
Environment setup
|
||||||
|
=================
|
||||||
|
|
||||||
|
Use the following instructions to set up the environment, configure the script to train models, and
|
||||||
|
reproduce the benchmark results on MI300X series accelerators with the ``{{ docker.pull_tag }}`` image.
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-requirements:
|
||||||
|
|
||||||
|
Download the Docker image
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
1. Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ docker.pull_tag }}
|
||||||
|
|
||||||
|
2. Launch the Docker container.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker run -it \
|
||||||
|
--device /dev/dri \
|
||||||
|
--device /dev/kfd \
|
||||||
|
--device /dev/infiniband \
|
||||||
|
--network host --ipc host \
|
||||||
|
--group-add video \
|
||||||
|
--cap-add SYS_PTRACE \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--privileged \
|
||||||
|
-v $HOME:$HOME \
|
||||||
|
--shm-size 128G \
|
||||||
|
--name primus_training_env \
|
||||||
|
{{ docker.pull_tag }}
|
||||||
|
|
||||||
|
3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker start primus_training_env
|
||||||
|
docker exec -it primus_training_env bash
|
||||||
|
|
||||||
|
The Docker container hosts the verified release tag ``v0.1.0-rc1`` of the `Primus
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1>`__ repository.
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-environment-setup:
|
||||||
|
|
||||||
|
Configuration
|
||||||
|
=============
|
||||||
|
|
||||||
|
Primus defines a training configuration in YAML for each model in
|
||||||
|
`examples/megatron/configs <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
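For example, you can list the available per-model configuration files from inside the container (the ``/workspace/Primus`` path matches the location used in the training steps below):

.. code-block:: shell

   ls /workspace/Primus/examples/megatron/configs/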
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set model_groups = data.model_groups %}
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
|
To update training parameters for {{ model.model }}, you can update ``examples/megatron/configs/{{ model.config_name }}``.
|
||||||
|
Note that training configuration YAML files for other models follow this naming convention.
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
See :ref:`Key options <amd-primus-megatron-lm-benchmark-test-vars>` for more information on configuration options.
|
||||||
|
|
||||||
|
Dataset options
|
||||||
|
---------------
|
||||||
|
|
||||||
|
You can use either mock data or real data for training.
|
||||||
|
|
||||||
|
* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default
|
||||||
|
value is ``true`` for enabled.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
mock_data: true
|
||||||
|
|
||||||
|
* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
mock_data: false
|
||||||
|
train_data_path: /path/to/your/dataset
|
||||||
|
|
||||||
|
Ensure that the files are accessible inside the Docker container.
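For example, if your dataset lives under ``/data/datasets`` on the host (a hypothetical path -- adjust it to your setup), add a corresponding bind mount to the ``docker run`` command from the setup step and verify the path from inside the container:

.. code-block:: shell

   # Additional flag for the docker run command shown earlier:
   #   -v /data/datasets:/data/datasets
   # Inside the container, confirm the path referenced by train_data_path exists:
   ls /data/datasets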
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-tokenizer:
|
||||||
|
|
||||||
|
Tokenizer
|
||||||
|
---------
|
||||||
|
|
||||||
|
In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama
|
||||||
|
3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
|
||||||
|
``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
|
||||||
|
definition. As such, you need to set the ``HF_TOKEN`` environment variable with the
|
||||||
|
right permissions to access the tokenizer for each model.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
# Export your HF_TOKEN in the workspace
|
||||||
|
export HF_TOKEN=<your_hftoken>
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-run-training:
|
||||||
|
|
||||||
|
Run training
|
||||||
|
============
|
||||||
|
|
||||||
|
Use the following example commands to set up the environment, configure
|
||||||
|
:ref:`key options <amd-primus-megatron-lm-benchmark-test-vars>`, and run training on
|
||||||
|
MI300X series accelerators with the Primus training environment.
|
||||||
|
|
||||||
|
Single node training
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
To run training on a single node, navigate to ``/workspace/Primus`` and use the following setup commands:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
pip install -r requirements.txt
|
||||||
|
export HSA_NO_SCRATCH_RECLAIM=1
|
||||||
|
export NVTE_CK_USES_BWD_V3=1
|
||||||
|
|
||||||
|
Once setup is complete, run the appropriate training command.
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||||
|
|
||||||
|
To run pre-training for Llama 3.3 70B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--micro_batch_size 2 \
|
||||||
|
--global_batch_size 16 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||||
|
|
||||||
|
To run pre-training for Llama 3.1 8B FP8, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50 \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
For Llama 3.1 8B BF16, use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||||
|
|
||||||
|
To run pre-training for Llama 3.1 70B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
To run training on a single node for Llama 3.1 70B FP8 with a 40-layer proxy, use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50 \
|
||||||
|
--num_layers 40 \
|
||||||
|
--fp8 hybrid \
|
||||||
|
--no_fp8_weight_transpose_cache true
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Use two or more nodes to run the *full* Llama 70B model with FP8 precision.
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
|
||||||
|
|
||||||
|
To run pre-training for Llama 2 7B FP8, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50 \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
To run pre-training for Llama 2 7B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
|
||||||
|
|
||||||
|
To run pre-training for Llama 2 70B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy
|
||||||
|
|
||||||
|
To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with a 3-layer proxy,
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh \
|
||||||
|
--num_layers 3 \
|
||||||
|
--moe_layer_freq 1 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||||
|
|
||||||
|
To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh \
|
||||||
|
--global_batch_size 256 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||||
|
|
||||||
|
To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||||
|
|
||||||
|
To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with a 4-layer proxy,
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh \
|
||||||
|
--num_layers 4 \
|
||||||
|
--pipeline_model_parallel_size 1 \
|
||||||
|
--micro_batch_size 1 \
|
||||||
|
--global_batch_size 16 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b

   To run training on a single node for Qwen 2.5 7B BF16, use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

   For FP8, use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
      bash examples/run_pretrain.sh \
        --train_iters 50 \
        --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

   To run the training on a single node for Qwen 2.5 72B BF16, use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

Multi-node training examples
----------------------------

To run training on multiple nodes, use the
`run_slurm_pretrain.sh <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/run_slurm_pretrain.sh>`__
script to launch the multi-node workload. Use the following steps to set up your environment:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

   {% set dockers = data.dockers %}
   {% set docker = dockers[0] %}

   .. code-block:: shell

      cd /workspace/Primus/
      export DOCKER_IMAGE={{ docker.pull_tag }}
      export HF_TOKEN=<your_HF_token>
      export HSA_NO_SCRATCH_RECLAIM=1
      export NVTE_CK_USES_BWD_V3=1
      export NCCL_IB_HCA=<your_NCCL_IB_HCA>                 # RDMA interfaces to use for communication
      export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME>   # your network interface
      export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME>   # your network interface
      export NCCL_IB_GID_INDEX=3                            # InfiniBand GID index for NCCL communication; 3 is the default for RoCE

.. note::

   * Make sure the correct network drivers are installed on the nodes. If you're running inside Docker, either install the drivers inside the container or pass them through from the host when creating the container.
   * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus tries to auto-detect them. However, because NICs can vary across clusters, it's recommended to explicitly export the NCCL parameters for your cluster.
   * To find your network interface, you can use ``ip a``.
   * To find RDMA interfaces, you can use ``ibv_devices`` to list all the RDMA/IB devices.

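As a reference, the following sketch shows how these values might be discovered and exported on a node. The interface and device names used here (``ens51np0``, ``mlx5_0``, ``mlx5_1``) are placeholders only; substitute the names reported by your own cluster.

.. code-block:: shell

   # List network interfaces and pick the one that carries node-to-node traffic.
   ip a

   # List the RDMA/IB devices visible on this node.
   ibv_devices

   # Example exports; replace the values with the names found above.
   export NCCL_SOCKET_IFNAME=ens51np0
   export GLOO_SOCKET_IFNAME=ens51np0
   export NCCL_IB_HCA=mlx5_0,mlx5_1
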
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b

   To train Llama 3.3 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 4 \
        --global_batch_size 256 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 3.3 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 1 \
        --global_batch_size 256 \
        --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b

   To train Llama 3.1 8B FP8 on 8 nodes, run:

   .. code-block:: shell

      # Adjust the training parameters for the node count. For example, set global_batch_size to 8 * single-node batch size for 8 nodes.
      NNODES=8 EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
        --global_batch_size 1024 \
        --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b

   To train Llama 3.1 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 4 \
        --global_batch_size 256 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 3.1 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 1 \
        --global_batch_size 256 \
        --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b

   To train Llama 2 7B FP8 on 8 nodes, run:

   .. code-block:: shell

      # Adjust the training parameters for the node count. For example, set global_batch_size to 8 * single-node batch size for 8 nodes.
      NNODES=8 EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
        --global_batch_size 2048 \
        --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b

   To train Llama 2 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 10 \
        --global_batch_size 640 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 2 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
        --micro_batch_size 2 \
        --global_batch_size 1536 \
        --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b

   To train Mixtral 8x7B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 2 \
        --global_batch_size 256

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

   To train Qwen 2.5 72B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 8 \
        --global_batch_size 512 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

.. _amd-primus-megatron-lm-benchmark-test-vars:

Key options
-----------

The following are key options to take note of when configuring a training run:

fp8
   ``hybrid`` enables FP8 GEMMs.

use_torch_fsdp2
   ``use_torch_fsdp2: 1`` enables Torch FSDP v2. If FSDP is enabled,
   set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``
   (see the sketch after this list).

profile
   To enable PyTorch profiling, set these parameters:

   .. code-block:: yaml

      profile: true
      use_pytorch_profiler: true
      profile_step_end: 7
      profile_step_start: 6

train_iters
   The total number of training iterations (default: 50).

mock_data
   Use mock (synthetic) data instead of a real dataset. ``true`` by default.

micro_batch_size
   Micro batch size.

global_batch_size
   Global batch size.

recompute_granularity
   For activation checkpointing.

num_layers
   For using a reduced number of layers, as with proxy models.

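The snippet below is a minimal sketch of how several of these options could be set together in an experiment YAML, assuming they are accepted as top-level keys in the same way as the profiling parameters shown above. Verify the exact key names against the config files under ``examples/megatron/configs``.

.. code-block:: yaml

   # Sketch only: representative settings for a short benchmarking run.
   train_iters: 50            # total number of iterations
   mock_data: true            # use synthetic data
   micro_batch_size: 2
   global_batch_size: 256
   fp8: hybrid                # enable FP8 GEMMs

   # If enabling Torch FSDP v2, disable the conflicting optimizer settings.
   use_torch_fsdp2: 1
   use_distributed_optimizer: false
   overlap_param_gather: false
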
Previous versions
=================

See :doc:`previous-versions/megatron-lm-history` to find documentation for previous releases
of the ``ROCm/megatron-lm`` Docker image.

This training environment now uses Primus with Megatron as the primary
configuration. Limited support for the legacy ROCm Megatron-LM is still
available. For instructions on using ROCm Megatron-LM, see the
:doc:`megatron-lm` document.

@@ -21,6 +21,8 @@ In this guide, you'll learn about:
 - Training a model

+- :doc:`With Primus (Megatron-LM backend) <benchmark-docker/primus-megatron>`
+
 - :doc:`With Megatron-LM <benchmark-docker/megatron-lm>`
 - :doc:`With PyTorch <benchmark-docker/pytorch-training>`

@@ -285,7 +285,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
 - Radeon AI PRO R9700
 - RDNA4
 - gfx1201
-- 16
+- 32
 - 64
 - 32 or 64
 - 128

@@ -27,6 +27,24 @@ subtrees:
 title: ROCm on Radeon GPUs
 - file: how-to/deep-learning-rocm.md
 title: Deep learning frameworks
+subtrees:
+- entries:
+- file: compatibility/ml-compatibility/pytorch-compatibility.rst
+title: PyTorch compatibility
+- file: compatibility/ml-compatibility/tensorflow-compatibility.rst
+title: TensorFlow compatibility
+- file: compatibility/ml-compatibility/jax-compatibility.rst
+title: JAX compatibility
+- file: compatibility/ml-compatibility/verl-compatibility.rst
+title: verl compatibility
+- file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
+title: Stanford Megatron-LM compatibility
+- file: compatibility/ml-compatibility/dgl-compatibility.rst
+title: DGL compatibility
+- file: compatibility/ml-compatibility/megablocks-compatibility.rst
+title: Megablocks compatibility
+- file: compatibility/ml-compatibility/taichi-compatibility.rst
+title: Taichi compatibility
 - file: how-to/build-rocm.rst
 title: Build ROCm from source

@@ -44,8 +62,8 @@ subtrees:
 title: Training
 subtrees:
 - entries:
-- file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
-title: Train a model with Megatron-LM
+- file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+title: Train a model with Primus and Megatron-Core
 - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
 title: Train a model with PyTorch
 - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst

@@ -1,4 +1,4 @@
-rocm-docs-core==1.20.1
+rocm-docs-core==1.22.0
 sphinx-reredirects
 sphinx-sitemap
 sphinxcontrib.datatemplates==0.11.0

@@ -23,7 +23,7 @@ beautifulsoup4==4.13.4
 # via pydata-sphinx-theme
 breathe==4.36.0
 # via rocm-docs-core
-certifi==2025.4.26
+certifi==2025.7.14
 # via requests
 cffi==1.17.1
 # via

@@ -35,18 +35,16 @@ click==8.2.1
 # via
 # jupyter-cache
 # sphinx-external-toc
-comm==0.2.2
+comm==0.2.3
 # via ipykernel
-cryptography==45.0.3
+cryptography==45.0.5
 # via pyjwt
-debugpy==1.8.14
+debugpy==1.8.15
 # via ipykernel
 decorator==5.2.1
 # via ipython
 defusedxml==0.7.1
 # via sphinxcontrib-datatemplates
-deprecated==1.2.18
-# via pygithub
 docutils==0.21.2
 # via
 # myst-parser

@@ -62,7 +60,7 @@ fastjsonschema==2.21.1
 # rocm-docs-core
 gitdb==4.0.12
 # via gitpython
-gitpython==3.1.44
+gitpython==3.1.45
 # via rocm-docs-core
 greenlet==3.2.3
 # via sqlalchemy

@@ -74,7 +72,7 @@ importlib-metadata==8.7.0
 # via
 # jupyter-cache
 # myst-nb
-ipykernel==6.29.5
+ipykernel==6.30.0
 # via myst-nb
 ipython==8.37.0
 # via

@@ -86,7 +84,7 @@ jinja2==3.1.6
 # via
 # myst-parser
 # sphinx
-jsonschema==4.24.0
+jsonschema==4.25.0
 # via nbformat
 jsonschema-specifications==2025.4.1
 # via jsonschema

@@ -116,7 +114,7 @@ mdit-py-plugins==0.4.2
 # via myst-parser
 mdurl==0.1.2
 # via markdown-it-py
-myst-nb==1.2.0
+myst-nb==1.3.0
 # via rocm-docs-core
 myst-parser==4.0.1
 # via myst-nb

@@ -134,7 +132,6 @@ nest-asyncio==1.6.0
 packaging==25.0
 # via
 # ipykernel
-# pydata-sphinx-theme
 # sphinx
 parso==0.8.4
 # via jedi

@@ -152,13 +149,13 @@ pure-eval==0.2.3
 # via stack-data
 pycparser==2.22
 # via cffi
-pydata-sphinx-theme==0.15.4
+pydata-sphinx-theme==0.16.1
 # via
 # rocm-docs-core
 # sphinx-book-theme
-pygithub==2.6.1
+pygithub==2.7.0
 # via rocm-docs-core
-pygments==2.19.1
+pygments==2.19.2
 # via
 # accessible-pygments
 # ipython

@@ -178,7 +175,7 @@ pyyaml==6.0.2
 # rocm-docs-core
 # sphinx-external-toc
 # sphinxcontrib-datatemplates
-pyzmq==26.4.0
+pyzmq==27.0.0
 # via
 # ipykernel
 # jupyter-client

@@ -190,9 +187,9 @@ requests==2.32.4
 # via
 # pygithub
 # sphinx
-rocm-docs-core==1.20.1
+rocm-docs-core==1.22.0
 # via -r requirements.in
-rpds-py==0.25.1
+rpds-py==0.26.0
 # via
 # jsonschema
 # referencing

@@ -220,7 +217,7 @@ sphinx==8.1.3
 # sphinx-reredirects
 # sphinxcontrib-datatemplates
 # sphinxcontrib-runcmd
-sphinx-book-theme==1.1.4
+sphinx-book-theme==1.1.3
 # via rocm-docs-core
 sphinx-copybutton==0.5.2
 # via rocm-docs-core

@@ -252,7 +249,7 @@ sphinxcontrib-runcmd==0.2.0
 # via sphinxcontrib-datatemplates
 sphinxcontrib-serializinghtml==2.0.0
 # via sphinx
-sqlalchemy==2.0.41
+sqlalchemy==2.0.42
 # via jupyter-cache
 stack-data==0.6.3
 # via ipython

@@ -266,7 +263,6 @@ tornado==6.5.1
 # jupyter-client
 traitlets==5.14.3
 # via
-# comm
 # ipykernel
 # ipython
 # jupyter-client

@@ -274,7 +270,7 @@ traitlets==5.14.3
 # matplotlib-inline
 # nbclient
 # nbformat
-typing-extensions==4.14.0
+typing-extensions==4.14.1
 # via
 # beautifulsoup4
 # exceptiongroup

@@ -290,7 +286,5 @@ urllib3==2.5.0
 # requests
 wcwidth==0.2.13
 # via prompt-toolkit
-wrapt==1.17.2
-# via deprecated
 zipp==3.23.0
 # via importlib-metadata