Mirror of https://github.com/ROCm/ROCm.git (synced 2026-01-10 23:28:03 -05:00)

Compare commits: update_jax...link-fix (31 commits)
Commit SHAs:

3c3847f9f7, 249bd177ec, b2ee8d4b2e, 3f834cf520, 70ba866c5b, 320ec4669a, c9bd93b537,
a060550bcd, c92cbaee66, c84afacc8d, 843fd1b3fb, 82221c4e2d, d0ebe126e7, 74610893a9,
afe3e21cad, ae2440772f, 61f970a24d, 85a1682573, 87c6e320b4, b50948fe6b, 91407405a9,
8f23f63a6b, 11747aaadc, 1088beefe5, b7988925a5, 89dafa6232, 8054852dad, 542d7813ce,
bc1ffe4fcb, 09997c68bb, 42bc3501ac
@@ -5,6 +5,7 @@ ACEs
 ACS
 AccVGPR
 AccVGPRs
+AITER
 ALU
 AllReduce
 AMD
@@ -115,6 +116,7 @@ Deprecations
 DevCap
 DirectX
 Dockerfile
+Dockerized
 Doxygen
 dropless
 ELMo
@@ -122,6 +124,7 @@ ENDPGM
 EPYC
 ESXi
 EoS
+fas
 FBGEMM
 FFT
 FFTs
@@ -194,6 +197,7 @@ HWE
 HWS
 Haswell
 Higgs
+href
 Hyperparameters
 Huggingface
 ICD
@@ -360,6 +364,7 @@ PowerEdge
 PowerShell
 Pretrained
 Pretraining
+Primus
 Profiler's
 PyPi
 Pytest
@@ -524,6 +529,7 @@ Xilinx
 Xnack
 Xteam
 YAML
+YAMLs
 YML
 YModel
 ZeRO
@@ -584,6 +590,7 @@ completers
 composable
 concretization
 config
+configs
 conformant
 constructible
 convolutional
@@ -794,7 +801,9 @@ preprocessing
 preprocessor
 prequantized
 prerequisites
+pretrain
 pretraining
+primus
 profiler
 profilers
 protobuf
@@ -31,9 +31,9 @@ ROCm Version,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6
 :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
 :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
 :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A
-:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,N/A,N/A,85f95ae,85f95ae,85f95ae,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
-:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`,N/A,N/A,N/A,N/A,0.7.0,0.7.0,0.7.0,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
 `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
 ,,,,,,,,,,,,,,,,,,
@@ -242,7 +242,9 @@ Expand for full historical view of:
 .. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
 .. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
 .. [#verl_compat] verl is only supported on ROCm 6.2.0.
+.. [#stanford-megatron-lm_compat] Stanford Megatron-LM is only supported on ROCm 6.3.0.
 .. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
+.. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
 .. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
 .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
 .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
@@ -147,6 +147,8 @@ article_pages = [
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},

@@ -0,0 +1,163 @@
+vllm_benchmark:
+  unified_docker:
+    latest:
+      # TODO: update me
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
+      rocm_version: 6.4.1
+      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a
+      hipblaslt_version: 0.15
+  model_groups:
+    - group: Meta Llama
+      tag: llama
+      models:
+        - model: Llama 3.1 8B
+          mad_tag: pyt_vllm_llama-3.1-8b
+          model_repo: meta-llama/Llama-3.1-8B-Instruct
+          url: https://huggingface.co/meta-llama/Llama-3.1-8B
+          precision: float16
+        - model: Llama 3.1 70B
+          mad_tag: pyt_vllm_llama-3.1-70b
+          model_repo: meta-llama/Llama-3.1-70B-Instruct
+          url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+          precision: float16
+        - model: Llama 3.1 405B
+          mad_tag: pyt_vllm_llama-3.1-405b
+          model_repo: meta-llama/Llama-3.1-405B-Instruct
+          url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+          precision: float16
+        - model: Llama 2 7B
+          mad_tag: pyt_vllm_llama-2-7b
+          model_repo: meta-llama/Llama-2-7b-chat-hf
+          url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+          precision: float16
+        - model: Llama 2 70B
+          mad_tag: pyt_vllm_llama-2-70b
+          model_repo: meta-llama/Llama-2-70b-chat-hf
+          url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+          precision: float16
+        - model: Llama 3.1 8B FP8
+          mad_tag: pyt_vllm_llama-3.1-8b_fp8
+          model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+          url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+          precision: float8
+        - model: Llama 3.1 70B FP8
+          mad_tag: pyt_vllm_llama-3.1-70b_fp8
+          model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+          url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+          precision: float8
+        - model: Llama 3.1 405B FP8
+          mad_tag: pyt_vllm_llama-3.1-405b_fp8
+          model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+          url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+          precision: float8
+    - group: Mistral AI
+      tag: mistral
+      models:
+        - model: Mixtral MoE 8x7B
+          mad_tag: pyt_vllm_mixtral-8x7b
+          model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+          url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+          precision: float16
+        - model: Mixtral MoE 8x22B
+          mad_tag: pyt_vllm_mixtral-8x22b
+          model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+          url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+          precision: float16
+        - model: Mistral 7B
+          mad_tag: pyt_vllm_mistral-7b
+          model_repo: mistralai/Mistral-7B-Instruct-v0.3
+          url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+          precision: float16
+        - model: Mixtral MoE 8x7B FP8
+          mad_tag: pyt_vllm_mixtral-8x7b_fp8
+          model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+          url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+          precision: float8
+        - model: Mixtral MoE 8x22B FP8
+          mad_tag: pyt_vllm_mixtral-8x22b_fp8
+          model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+          url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+          precision: float8
+        - model: Mistral 7B FP8
+          mad_tag: pyt_vllm_mistral-7b_fp8
+          model_repo: amd/Mistral-7B-v0.1-FP8-KV
+          url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
+          precision: float8
+    - group: Qwen
+      tag: qwen
+      models:
+        - model: Qwen2 7B
+          mad_tag: pyt_vllm_qwen2-7b
+          model_repo: Qwen/Qwen2-7B-Instruct
+          url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
+          precision: float16
+        - model: Qwen2 72B
+          mad_tag: pyt_vllm_qwen2-72b
+          model_repo: Qwen/Qwen2-72B-Instruct
+          url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
+          precision: float16
+        - model: QwQ-32B
+          mad_tag: pyt_vllm_qwq-32b
+          model_repo: Qwen/QwQ-32B
+          url: https://huggingface.co/Qwen/QwQ-32B
+          precision: float16
+          tunableop: true
+    - group: Databricks DBRX
+      tag: dbrx
+      models:
+        - model: DBRX Instruct
+          mad_tag: pyt_vllm_dbrx-instruct
+          model_repo: databricks/dbrx-instruct
+          url: https://huggingface.co/databricks/dbrx-instruct
+          precision: float16
+        - model: DBRX Instruct FP8
+          mad_tag: pyt_vllm_dbrx_fp8
+          model_repo: amd/dbrx-instruct-FP8-KV
+          url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
+          precision: float8
+    - group: Google Gemma
+      tag: gemma
+      models:
+        - model: Gemma 2 27B
+          mad_tag: pyt_vllm_gemma-2-27b
+          model_repo: google/gemma-2-27b
+          url: https://huggingface.co/google/gemma-2-27b
+          precision: float16
+    - group: Cohere
+      tag: cohere
+      models:
+        - model: C4AI Command R+ 08-2024
+          mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
+          model_repo: CohereForAI/c4ai-command-r-plus-08-2024
+          url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
+          precision: float16
+        - model: C4AI Command R+ 08-2024 FP8
+          mad_tag: pyt_vllm_command-r-plus_fp8
+          model_repo: amd/c4ai-command-r-plus-FP8-KV
+          url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
+          precision: float8
+    - group: DeepSeek
+      tag: deepseek
+      models:
+        - model: DeepSeek MoE 16B
+          mad_tag: pyt_vllm_deepseek-moe-16b-chat
+          model_repo: deepseek-ai/deepseek-moe-16b-chat
+          url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
+          precision: float16
+    - group: Microsoft Phi
+      tag: phi
+      models:
+        - model: Phi-4
+          mad_tag: pyt_vllm_phi-4
+          model_repo: microsoft/phi-4
+          url: https://huggingface.co/microsoft/phi-4
+    - group: TII Falcon
+      tag: falcon
+      models:
+        - model: Falcon 180B
+          mad_tag: pyt_vllm_falcon-180b
+          model_repo: tiiuae/falcon-180B
+          url: https://huggingface.co/tiiuae/falcon-180B
+          precision: float16
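For reference, the ``pull_tag`` recorded above corresponds to a plain ``docker pull`` of the prebuilt image. A minimal sketch, assuming access to Docker Hub:

   docker pull rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715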
@@ -39,7 +39,7 @@ pytorch_inference_benchmark:
           model_repo: Wan-AI/Wan2.1-T2V-14B
           url: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
           precision: bfloat16
-    - group: Janus-Pro
+    - group: Janus Pro
       tag: janus-pro
       models:
         - model: Janus Pro 7B
@@ -47,3 +47,11 @@ pytorch_inference_benchmark:
           model_repo: deepseek-ai/Janus-Pro-7B
           url: https://huggingface.co/deepseek-ai/Janus-Pro-7B
           precision: bfloat16
+    - group: Hunyuan Video
+      tag: hunyuan
+      models:
+        - model: Hunyuan Video
+          mad_tag: pyt_hy_video
+          model_repo: tencent/HunyuanVideo
+          url: https://huggingface.co/tencent/HunyuanVideo
+          precision: float16
@@ -2,11 +2,11 @@ vllm_benchmark:
   unified_docker:
     latest:
       # TODO: update me
-      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
       rocm_version: 6.4.1
-      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
-      pytorch_version: 2.7.0+gitf717b2a
+      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a (2.7.0+gitf717b2a)
       hipblaslt_version: 0.15
   model_groups:
     - group: Meta Llama
@@ -27,11 +27,6 @@ vllm_benchmark:
           model_repo: meta-llama/Llama-3.1-405B-Instruct
           url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
           precision: float16
-        - model: Llama 2 7B
-          mad_tag: pyt_vllm_llama-2-7b
-          model_repo: meta-llama/Llama-2-7b-chat-hf
-          url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
-          precision: float16
         - model: Llama 2 70B
           mad_tag: pyt_vllm_llama-2-70b
           model_repo: meta-llama/Llama-2-70b-chat-hf
@@ -65,11 +60,6 @@ vllm_benchmark:
           model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
           url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
           precision: float16
-        - model: Mistral 7B
-          mad_tag: pyt_vllm_mistral-7b
-          model_repo: mistralai/Mistral-7B-Instruct-v0.3
-          url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
-          precision: float16
         - model: Mixtral MoE 8x7B FP8
           mad_tag: pyt_vllm_mixtral-8x7b_fp8
           model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
@@ -80,72 +70,15 @@ vllm_benchmark:
           model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
           url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
           precision: float8
-        - model: Mistral 7B FP8
-          mad_tag: pyt_vllm_mistral-7b_fp8
-          model_repo: amd/Mistral-7B-v0.1-FP8-KV
-          url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
-          precision: float8
     - group: Qwen
       tag: qwen
       models:
-        - model: Qwen2 7B
-          mad_tag: pyt_vllm_qwen2-7b
-          model_repo: Qwen/Qwen2-7B-Instruct
-          url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
-          precision: float16
-        - model: Qwen2 72B
-          mad_tag: pyt_vllm_qwen2-72b
-          model_repo: Qwen/Qwen2-72B-Instruct
-          url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
-          precision: float16
         - model: QwQ-32B
           mad_tag: pyt_vllm_qwq-32b
           model_repo: Qwen/QwQ-32B
           url: https://huggingface.co/Qwen/QwQ-32B
           precision: float16
           tunableop: true
-    - group: Databricks DBRX
-      tag: dbrx
-      models:
-        - model: DBRX Instruct
-          mad_tag: pyt_vllm_dbrx-instruct
-          model_repo: databricks/dbrx-instruct
-          url: https://huggingface.co/databricks/dbrx-instruct
-          precision: float16
-        - model: DBRX Instruct FP8
-          mad_tag: pyt_vllm_dbrx_fp8
-          model_repo: amd/dbrx-instruct-FP8-KV
-          url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
-          precision: float8
-    - group: Google Gemma
-      tag: gemma
-      models:
-        - model: Gemma 2 27B
-          mad_tag: pyt_vllm_gemma-2-27b
-          model_repo: google/gemma-2-27b
-          url: https://huggingface.co/google/gemma-2-27b
-          precision: float16
-    - group: Cohere
-      tag: cohere
-      models:
-        - model: C4AI Command R+ 08-2024
-          mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
-          model_repo: CohereForAI/c4ai-command-r-plus-08-2024
-          url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
-          precision: float16
-        - model: C4AI Command R+ 08-2024 FP8
-          mad_tag: pyt_vllm_command-r-plus_fp8
-          model_repo: amd/c4ai-command-r-plus-FP8-KV
-          url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
-          precision: float8
-    - group: DeepSeek
-      tag: deepseek
-      models:
-        - model: DeepSeek MoE 16B
-          mad_tag: pyt_vllm_deepseek-moe-16b-chat
-          model_repo: deepseek-ai/deepseek-moe-16b-chat
-          url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
-          precision: float16
     - group: Microsoft Phi
       tag: phi
       models:
@@ -153,11 +86,3 @@ vllm_benchmark:
           mad_tag: pyt_vllm_phi-4
           model_repo: microsoft/phi-4
           url: https://huggingface.co/microsoft/phi-4
-    - group: TII Falcon
-      tag: falcon
-      models:
-        - model: Falcon 180B
-          mad_tag: pyt_vllm_falcon-180b
-          model_repo: tiiuae/falcon-180B
-          url: https://huggingface.co/tiiuae/falcon-180B
-          precision: float16
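To confirm that a pulled image matches the ``vllm_version`` recorded above, the version can be queried inside the container. A sketch, assuming the image has been pulled and ``python3`` is on its default path:

   docker run --rm rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812 python3 -c "import vllm; print(vllm.__version__)"

The printed value should correspond to the ``vllm_version`` field (0.10.0).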
@@ -1,26 +1,15 @@
 dockers:
-  - pull_tag: rocm/megatron-lm:v25.6_py312
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
     components:
-      ROCm: 6.4.1
-      PyTorch: 2.8.0a0+git7d205b2
-      Python: 3.12
-      Transformer Engine: 2.1.0.dev0+8c4a512
-      hipBLASLt: 393e413
-      Triton: 3.3.0
-      RCCL: 2.23.4.7a84c5d
-    doc_name: Ubuntu 24.04 + Python 3.12
-  - pull_tag: rocm/megatron-lm:v25.6_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
-    components:
-      ROCm: 6.4.1
-      PyTorch: 2.8.0a0+git7d205b2
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
       Python: "3.10"
-      Transformer Engine: 2.1.0.dev0+8c4a512
-      hipBLASLt: 393e413
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
       Triton: 3.3.0
-      RCCL: 2.23.4.7a84c5d
-    doc_name: Ubuntu 22.04 + Python 3.10
+      RCCL: 2.22.3
 model_groups:
   - group: Meta Llama
     tag: llama
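The updated training image can be pulled and started interactively in the same way as the other ROCm images in this changeset. A sketch using the standard ROCm device flags that appear in the accompanying vLLM guide:

   docker pull rocm/megatron-lm:v25.7_py310
   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G rocm/megatron-lm:v25.7_py310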
@@ -0,0 +1,60 @@
+dockers:
+  - pull_tag: rocm/megatron-lm:v25.6_py312
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
+    components:
+      ROCm: 6.4.1
+      PyTorch: 2.8.0a0+git7d205b2
+      Python: 3.12
+      Transformer Engine: 2.1.0.dev0+8c4a512
+      hipBLASLt: 393e413
+      Triton: 3.3.0
+      RCCL: 2.23.4.7a84c5d
+    doc_name: Ubuntu 24.04 + Python 3.12
+  - pull_tag: rocm/megatron-lm:v25.6_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
+    components:
+      ROCm: 6.4.1
+      PyTorch: 2.8.0a0+git7d205b2
+      Python: "3.10"
+      Transformer Engine: 2.1.0.dev0+8c4a512
+      hipBLASLt: 393e413
+      Triton: 3.3.0
+      RCCL: 2.23.4.7a84c5d
+    doc_name: Ubuntu 22.04 + Python 3.10
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
+      - model: Llama 3.1 8B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
+      - model: Llama 3.1 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
+      - model: Llama 3.1 70B (proxy)
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
+      - model: Llama 2 7B
+        mad_tag: pyt_megatron_lm_train_llama-2-7b
+      - model: Llama 2 70B
+        mad_tag: pyt_megatron_lm_train_llama-2-70b
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
+      - model: DeepSeek-V2-Lite
+        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
+      - model: Qwen 2.5 72B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
@@ -0,0 +1,58 @@
+dockers:
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
+    components:
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
+      Python: "3.10"
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
+      Triton: 3.3.0
+      RCCL: 2.22.3
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
+        config_name: llama3.3_70B-pretrain.yaml
+      - model: Llama 3.1 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
+        config_name: llama3.1_70B-pretrain.yaml
+      - model: Llama 3.1 8B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
+        config_name: llama3.1_8B-pretrain.yaml
+      - model: Llama 2 7B
+        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
+        config_name: llama2_7B-pretrain.yaml
+      - model: Llama 2 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
+        config_name: llama2_70B-pretrain.yaml
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
+        config_name: deepseek_v3-pretrain.yaml
+      - model: DeepSeek-V2-Lite
+        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
+        config_name: deepseek_v2_lite-pretrain.yaml
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
+        config_name: mixtral_8x7B_v0.1-pretrain.yaml
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
+        config_name: mixtral_8x22B_v0.1-pretrain.yaml
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
+        config_name: primus_qwen2.5_7B-pretrain.yaml
+      - model: Qwen 2.5 72B
+        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
+        config_name: qwen2.5_72B-pretrain.yaml
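The ``mad_tag`` values above plug into the same MAD workflow used by the inference guides in this changeset. A sketch, assuming the ROCm/MAD repository and its requirements are installed on the host and that these training tags are registered in MAD's ``models.json``:

   git clone https://github.com/ROCm/MAD
   cd MAD
   pip install -r requirements.txt
   madengine run --tags primus_pyt_megatron_lm_train_llama-3.1-8b --live-output --timeout 28800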
@@ -2,58 +2,132 @@
 :description: How to install deep learning frameworks for ROCm
 :keywords: deep learning, frameworks, ROCm, install, PyTorch, TensorFlow, JAX, MAGMA, DeepSpeed, ML, AI

-********************************************
-Installing deep learning frameworks for ROCm
-********************************************
+**********************************
+Deep learning frameworks for ROCm
+**********************************

-ROCm provides a comprehensive ecosystem for deep learning development, including
-:ref:`libraries <artificial-intelligence-apis>` for optimized deep learning operations and ROCm-aware versions of popular
-deep learning frameworks and libraries such as PyTorch, TensorFlow, and JAX. ROCm works closely with these
-frameworks to ensure that framework-specific optimizations take advantage of AMD accelerator and GPU architectures.
+Deep learning frameworks provide environments for machine learning, training, fine-tuning, inference, and performance optimization.

-The following guides provide information on compatibility and supported
-features for these ROCm-enabled deep learning frameworks.
+ROCm offers a complete ecosystem for developing and running deep learning applications efficiently. It also provides ROCm-compatible versions of popular frameworks and libraries, such as PyTorch, TensorFlow, JAX, and others.

-* :doc:`PyTorch compatibility <../compatibility/ml-compatibility/pytorch-compatibility>`
-* :doc:`TensorFlow compatibility <../compatibility/ml-compatibility/tensorflow-compatibility>`
-* :doc:`JAX compatibility <../compatibility/ml-compatibility/jax-compatibility>`
-* :doc:`verl compatibility <../compatibility/ml-compatibility/verl-compatibility>`
-* :doc:`Stanford Megatron-LM compatibility <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`
-* :doc:`DGL compatibility <../compatibility/ml-compatibility/dgl-compatibility>`
-* :doc:`Megablocks compatibility <../compatibility/ml-compatibility/megablocks-compatibility>`
-* :doc:`Taichi compatibility <../compatibility/ml-compatibility/taichi-compatibility>`
+The AMD ROCm organization actively contributes to open-source development and collaborates closely with framework organizations. This collaboration ensures that framework-specific optimizations effectively leverage AMD GPUs and accelerators.

-This chart steps through typical installation workflows for installing deep learning frameworks for ROCm.
+The table below summarizes information about ROCm-enabled deep learning frameworks. It includes details on ROCm compatibility and third-party tool support, installation steps and options, and links to GitHub resources. For a complete list of supported framework versions on ROCm, see the :doc:`Compatibility matrix <../compatibility/compatibility-matrix>` topic.

-.. image:: ../data/how-to/framework_install_2024_07_04.png
-   :alt: Flowchart for installing ROCm-aware machine learning frameworks
-   :align: center
-
-See the installation instructions to get started.
-
-* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
-* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
-* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
-* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
-* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
-* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/dgl-install>`
-* :doc:`Megablocks for ROCm <rocm-install-on-linux:install/3rd-party/megablocks-install>`
-* :doc:`Taichi for ROCm <rocm-install-on-linux:install/3rd-party/taichi-install>`
-
-.. note::
-
-   For guidance on installing ROCm itself, refer to :doc:`ROCm installation for Linux <rocm-install-on-linux:index>`.
+.. list-table::
+   :header-rows: 1
+   :widths: 5 3 6 3
+
+   * - Framework
+     - Installation
+     - Installation options
+     - GitHub
+
+   * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`_
+       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`_
+       - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`_
+       - `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`_
+       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
+
+   * - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`_
+     - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
+     -
+       - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`_
+       - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`_
+     - .. raw:: html
+
+          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>

 Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
 through the following guides.

 * :doc:`rocm-for-ai/index`

-* :doc:`Training <rocm-for-ai/training/index>`
-
-* :doc:`Fine-tuning LLMs <rocm-for-ai/fine-tuning/index>`
-
-* :doc:`Inference <rocm-for-ai/inference/index>`
-
-* :doc:`Inference optimization <rocm-for-ai/inference-optimization/index>`
+* :doc:`Use ROCm for training <rocm-for-ai/training/index>`
+
+* :doc:`Use ROCm for fine-tuning LLMs <rocm-for-ai/fine-tuning/index>`
+
+* :doc:`Use ROCm for AI inference <rocm-for-ai/inference/index>`
+
+* :doc:`Use ROCm for AI inference optimization <rocm-for-ai/inference-optimization/index>`
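As a concrete example of the "Docker image" installation option listed in the new table, the ROCm PyTorch image can be pulled and sanity-checked from Python. A sketch, assuming a configured ROCm host and the ``rocm/pytorch`` image on Docker Hub:

   docker pull rocm/pytorch:latest
   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/pytorch:latest python3 -c "import torch; print(torch.cuda.is_available())"

On ROCm builds of PyTorch, ``torch.cuda.is_available()`` reports whether the GPU is visible inside the container.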
@@ -14,7 +14,7 @@ vLLM inference performance testing
    This documentation does not reflect the latest version of ROCm vLLM
    inference performance documentation. See :doc:`../vllm` for the latest version.

-.. _vllm-benchmark-unified-docker:
+.. _vllm-benchmark-unified-docker-702:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml

@@ -77,7 +77,7 @@ vLLM inference performance testing
       </div>
    </div>

-   .. _vllm-benchmark-vllm:
+   .. _vllm-benchmark-vllm-702:

    {% for model_group in model_groups %}
    {% for model in model_group.models %}
@@ -159,7 +159,7 @@ vLLM inference performance testing
    Once the setup is complete, choose between two options to reproduce the
    benchmark results:

-   .. _vllm-benchmark-mad:
+   .. _vllm-benchmark-mad-702:

    {% for model_group in model_groups %}
    {% for model in model_group.models %}
@@ -0,0 +1,450 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
|
.. meta::
|
||||||
|
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
|
||||||
|
ROCm vLLM Docker image.
|
||||||
|
:keywords: model, MAD, automation, dashboarding, validate
|
||||||
|
|
||||||
|
**********************************
|
||||||
|
vLLM inference performance testing
|
||||||
|
**********************************
|
||||||
|
|
||||||
|
.. caution::
|
||||||
|
|
||||||
|
This documentation does not reflect the latest version of ROCm vLLM
|
||||||
|
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||||
|
|
||||||
|
.. _vllm-benchmark-unified-docker-715:
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
|
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
|
||||||
|
a prebuilt, optimized environment for validating large language model (LLM)
|
||||||
|
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
|
||||||
|
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
|
||||||
|
accelerators and includes the following components:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Software component
|
||||||
|
- Version
|
||||||
|
|
||||||
|
* - `ROCm <https://github.com/ROCm/ROCm>`__
|
||||||
|
- {{ unified_docker.rocm_version }}
|
||||||
|
|
||||||
|
* - `vLLM <https://docs.vllm.ai/en/latest>`__
|
||||||
|
- {{ unified_docker.vllm_version }}
|
||||||
|
|
||||||
|
* - `PyTorch <https://github.com/ROCm/pytorch>`__
|
||||||
|
- {{ unified_docker.pytorch_version }}
|
||||||
|
|
||||||
|
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
|
||||||
|
- {{ unified_docker.hipblaslt_version }}
|
||||||
|
|
||||||
|
With this Docker image, you can quickly test the :ref:`expected
|
||||||
|
inference performance numbers <vllm-benchmark-performance-measurements>` for
|
||||||
|
MI300X series accelerators.
|
||||||
|
|
||||||
|
What's new
|
||||||
|
==========
|
||||||
|
|
||||||
|
The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
|
||||||
|
|
||||||
|
* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default.
|
||||||
|
This parameter has been removed from the benchmarking script.
|
||||||
|
|
||||||
|
* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
|
||||||
|
This parameter has been removed from the benchmarking script.
|
||||||
|
|
||||||
|
* Fixed a ``+rms_norm`` custom kernel issue.
|
||||||
|
|
||||||
|
* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``.
|
||||||
|
|
||||||
|
* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
|
||||||
|
|
||||||
|
Supported models
|
||||||
|
================
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
|
.. _vllm-benchmark-available-models-715:
|
||||||
|
|
||||||
|
The following models are supported for inference performance benchmarking
|
||||||
|
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||||
|
documentation might vary by model -- select one to get started.
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model group</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row mt-1">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
.. _vllm-benchmark-vllm-715:
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{model.mad_tag}}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||||
|
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
vLLM is a toolkit and library for LLM inference and serving. AMD implements
|
||||||
|
high-performance custom kernels and modules in vLLM to enhance performance.
|
||||||
|
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||||
|
more information.
|
||||||
|
|
||||||
|
.. _vllm-benchmark-performance-measurements-715:
|
||||||
|
|
||||||
|
Performance measurements
|
||||||
|
========================
|
||||||
|
|
||||||
|
To evaluate performance, the
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
|
page provides reference throughput and latency measurements for inferencing popular AI models.
|
||||||
|
|
||||||
|
.. important::
|
||||||
|
|
||||||
|
The performance data presented in
|
||||||
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
|
only reflects the latest version of this inference benchmarking environment.
|
||||||
|
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
||||||
|
|
||||||
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
|
correctly and performing optimally.
|
||||||
|
|
||||||
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before starting training.
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
|
||||||
|
|
||||||
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
|
Pull the Docker image
|
||||||
|
=====================
|
||||||
|
|
||||||
|
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
|
||||||
|
Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
|
Benchmarking
|
||||||
|
============
|
||||||
|
|
||||||
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
|
benchmark results:
|
||||||
|
|
||||||
|
.. _vllm-benchmark-mad-715:
|
||||||
|
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
|
||||||
|
.. container:: model-doc {{model.mad_tag}}
|
||||||
|
|
||||||
|
.. tab-set::
|
||||||
|
|
||||||
|
.. tab-item:: MAD-integrated benchmarking
|
||||||
|
|
||||||
|
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||||
|
directory and install the required packages on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
|
||||||
|
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||||
|
madengine run \
|
||||||
|
--tags {{model.mad_tag}} \
|
||||||
|
--keep-model-dir \
|
||||||
|
--live-output \
|
||||||
|
--timeout 28800
|
||||||
|
|
||||||
|
MAD launches a Docker container with the name
|
||||||
|
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
||||||
|
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
|
||||||
|
|
||||||
|
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
|
||||||
|
to collect latency and throughput performance data, you can also change the benchmarking
|
||||||
|
parameters. See the standalone benchmarking tab for more information.
|
||||||
|
|
||||||
|
{% if model.tunableop %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
|
||||||
|
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||||
|
operators to find the fastest one for your hardware.
|
||||||
|
|
||||||
|
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
|
||||||
|
(see
|
||||||
|
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
|
||||||
|
To enable it, include the ``--tunableop on`` argument in your
|
||||||
|
run.
|
||||||
|
|
||||||
|
Enabling TunableOp triggers a two-pass run -- a warm-up followed
|
||||||
|
by the performance-collection run.
|
||||||
|
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
.. tab-item:: Standalone benchmarking
|
||||||
|
|
||||||
|
.. rubric:: Download the Docker image and required scripts
|
||||||
|
|
||||||
|
1. Run the vLLM benchmark tool independently by starting the
|
||||||
|
`Docker container <{{ unified_docker.docker_hub_url }}>`_
|
||||||
|
as shown in the following snippet.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ unified_docker.pull_tag }}
|
||||||
|
docker run -it \
|
||||||
|
--device=/dev/kfd \
|
||||||
|
--device=/dev/dri \
|
||||||
|
--group-add video \
|
||||||
|
--shm-size 16G \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--security-opt apparmor=unconfined \
|
||||||
|
--cap-add=SYS_PTRACE \
|
||||||
|
-v $(pwd):/workspace \
|
||||||
|
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||||
|
--name test \
|
||||||
|
{{ unified_docker.pull_tag }}
|
||||||
|
|
||||||
|
2. In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||||
|
benchmark scripts directory at ``~/MAD/scripts/vllm``.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/MAD
|
||||||
|
cd MAD/scripts/vllm
|
||||||
|
|
||||||
|
3. To start the benchmark, use the following command with the appropriate options.
|
||||||
|
|
||||||
|
.. dropdown:: Benchmark options
|
||||||
|
:open:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
:align: center
|
||||||
|
|
||||||
|
* - Name
|
||||||
|
- Options
|
||||||
|
- Description
|
||||||
|
|
||||||
|
* - ``$test_option``
|
||||||
|
- latency
|
||||||
|
- Measure decoding token latency
|
||||||
|
|
||||||
|
* -
|
||||||
|
- throughput
|
||||||
|
- Measure token generation throughput
|
||||||
|
|
||||||
|
* -
|
||||||
|
- all
|
||||||
|
- Measure both throughput and latency
|
||||||
|
|
||||||
|
* - ``$num_gpu``
|
||||||
|
- 1 or 8
|
||||||
|
- Number of GPUs
|
||||||
|
|
||||||
|
* - ``$datatype``
|
||||||
|
- ``float16`` or ``float8``
|
||||||
|
- Data type
|
||||||
|
|
||||||
|
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||||
|
already configured. You don't need to specify them with this script.
|
||||||
|
|
||||||
|
Command:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh \
|
||||||
|
-s $test_option \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g $num_gpu \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` (see the example at the end of this note).
|
||||||
|
|
||||||
|
If you encounter the following error, pass your access-authorized Hugging
|
||||||
|
Face token to the gated models.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
OSError: You are trying to access a gated repo.
|
||||||
|
|
||||||
|
# pass your HF_TOKEN
|
||||||
|
export HF_TOKEN=$your_personal_hf_token
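For example, here is a minimal sketch that combines the recommendation above with the benchmark invocation used on this page:

.. code-block:: shell

   # Recommended attention path for vLLM V1 (see the recommendation above)
   export VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
   ./vllm_benchmark_report.sh \
       -s all \
       -m {{model.model_repo}} \
       -g 8 \
       -d {{model.precision}}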
|
||||||
|
|
||||||
|
.. rubric:: Benchmarking examples
|
||||||
|
|
||||||
|
Here are some examples of running the benchmark with various options:
|
||||||
|
|
||||||
|
* Latency benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh \
|
||||||
|
-s latency \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g 8 \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
|
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
|
||||||
|
|
||||||
|
* Throughput benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
./vllm_benchmark_report.sh \
|
||||||
|
-s throughput \
|
||||||
|
-m {{model.model_repo}} \
|
||||||
|
-g 8 \
|
||||||
|
-d {{model.precision}}
|
||||||
|
|
||||||
|
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<style>
|
||||||
|
mjx-container[jax="CHTML"][display="true"] {
|
||||||
|
text-align: left;
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Throughput is calculated as:
|
||||||
|
|
||||||
|
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||||
|
|
||||||
|
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
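As a purely illustrative example with hypothetical numbers (not measured results), 100 requests with 128 input tokens and 128 output tokens each, completing in 10 seconds, would give:

- .. math:: throughput\_tot = 100 \times (128 + 128) / 10 = 2560 \ \mathsf{\text{tokens/s}}

- .. math:: throughput\_gen = 100 \times 128 / 10 = 1280 \ \mathsf{\text{tokens/s}}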
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
Advanced usage
|
||||||
|
==============
|
||||||
|
|
||||||
|
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||||
|
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
|
||||||
|
|
||||||
|
Reproducing the Docker image
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
||||||
|
|
||||||
|
1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
git clone https://github.com/ROCm/vllm.git
|
||||||
|
|
||||||
|
2. Check out the specific release commit.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd vllm
|
||||||
|
git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
|
||||||
|
|
||||||
|
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
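After the build completes, you can confirm the image exists locally (a quick, optional check; ``vllm-rocm`` matches the tag used in the build command above):

.. code-block:: shell

   docker images vllm-rocm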
|
||||||
|
|
||||||
|
Known issues and workarounds
|
||||||
|
============================
|
||||||
|
|
||||||
|
AITER does not support FP8 KV cache yet.
|
||||||
|
|
||||||
|
Further reading
|
||||||
|
===============
|
||||||
|
|
||||||
|
- To learn more about the options for latency and throughput benchmark scripts,
|
||||||
|
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||||
|
|
||||||
|
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||||
|
|
||||||
|
- To learn more about system settings and management practices to configure your system for
|
||||||
|
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||||
|
|
||||||
|
- For application performance optimization strategies for HPC and AI workloads,
|
||||||
|
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||||
|
|
||||||
|
- To learn how to run community models from Hugging Face on AMD GPUs, see
|
||||||
|
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
|
||||||
|
|
||||||
|
- To learn how to fine-tune LLMs and optimize inference, see
|
||||||
|
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
|
||||||
|
|
||||||
|
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||||
|
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||||
|
|
||||||
|
Previous versions
|
||||||
|
=================
|
||||||
|
|
||||||
|
See :doc:`vllm-history` to find documentation for previous releases
|
||||||
|
of the ``ROCm/vllm`` Docker image.
|
||||||
@@ -16,14 +16,23 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
|
|||||||
- Components
|
- Components
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812``
|
||||||
(latest)
|
(latest)
|
||||||
|
-
|
||||||
|
* ROCm 6.4.1
|
||||||
|
* vLLM 0.10.0
|
||||||
|
* PyTorch 2.7.0
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <../vllm>`
|
||||||
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa>`__
|
||||||
|
|
||||||
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
|
||||||
-
|
-
|
||||||
* ROCm 6.4.1
|
* ROCm 6.4.1
|
||||||
* vLLM 0.9.1
|
* vLLM 0.9.1
|
||||||
* PyTorch 2.7.0
|
* PyTorch 2.7.0
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../vllm>`
|
* :doc:`Documentation <vllm-0.9.1-20250715>`
|
||||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__
|
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__
|
||||||
|
|
||||||
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
|
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ PyTorch inference performance testing
|
|||||||
|
|
||||||
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
|
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
|
||||||
|
|
||||||
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference pyt_janus_pro_inference
|
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference pyt_janus_pro_inference pyt_hy_video
|
||||||
|
|
||||||
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`__ from Docker Hub.
|
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`__ from Docker Hub.
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
vLLM inference performance testing
|
vLLM inference performance testing
|
||||||
**********************************
|
**********************************
|
||||||
|
|
||||||
.. _vllm-benchmark-unified-docker:
|
.. _vllm-benchmark-unified-docker-812:
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||||
|
|
||||||
@@ -47,17 +47,11 @@ What's new
|
|||||||
|
|
||||||
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
||||||
|
|
||||||
* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default.
|
* Upgraded to vLLM v0.10.
|
||||||
This parameter has been removed from the benchmarking script.
|
|
||||||
|
|
||||||
* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
|
* FP8 KV cache support via AITER.
|
||||||
This parameter has been removed from the benchmarking script.
|
|
||||||
|
|
||||||
* Fixed a ``+rms_norm`` custom kernel issue.
|
* Full graph capture support via AITER.
|
||||||
|
|
||||||
* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``.
|
|
||||||
|
|
||||||
* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
|
|
||||||
|
|
||||||
Supported models
|
Supported models
|
||||||
================
|
================
|
||||||
@@ -67,7 +61,7 @@ Supported models
|
|||||||
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
|
||||||
{% set model_groups = data.vllm_benchmark.model_groups %}
|
{% set model_groups = data.vllm_benchmark.model_groups %}
|
||||||
|
|
||||||
.. _vllm-benchmark-available-models:
|
.. _vllm-benchmark-available-models-812:
|
||||||
|
|
||||||
The following models are supported for inference performance benchmarking
|
The following models are supported for inference performance benchmarking
|
||||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||||
@@ -102,7 +96,7 @@ Supported models
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
.. _vllm-benchmark-vllm:
|
.. _vllm-benchmark-vllm-812:
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
@@ -124,14 +118,14 @@ Supported models
|
|||||||
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||||
more information.
|
more information.
|
||||||
|
|
||||||
.. _vllm-benchmark-performance-measurements:
|
.. _vllm-benchmark-performance-measurements-812:
|
||||||
|
|
||||||
Performance measurements
|
Performance measurements
|
||||||
========================
|
========================
|
||||||
|
|
||||||
To evaluate performance, the
|
To evaluate performance, the
|
||||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||||
page provides reference throughput and latency measurements for inferencing popular AI models.
|
page provides reference throughput and serving measurements for inferencing popular AI models.
|
||||||
|
|
||||||
.. important::
|
.. important::
|
||||||
|
|
||||||
@@ -176,7 +170,7 @@ system's configuration.
|
|||||||
Once the setup is complete, choose between two options to reproduce the
|
Once the setup is complete, choose between two options to reproduce the
|
||||||
benchmark results:
|
benchmark results:
|
||||||
|
|
||||||
.. _vllm-benchmark-mad:
|
.. _vllm-benchmark-mad-812:
|
||||||
|
|
||||||
{% for model_group in model_groups %}
|
{% for model_group in model_groups %}
|
||||||
{% for model in model_group.models %}
|
{% for model in model_group.models %}
|
||||||
@@ -209,12 +203,15 @@ system's configuration.
|
|||||||
--timeout 28800
|
--timeout 28800
|
||||||
|
|
||||||
MAD launches a Docker container with the name
|
MAD launches a Docker container with the name
|
||||||
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||||
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
|
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||||
|
and ``{{ model.mad_tag }}_serving.csv``.
|
||||||
|
|
||||||
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
|
Although the :ref:`available models
|
||||||
to collect latency and throughput performance data, you can also change the benchmarking
|
<vllm-benchmark-available-models>` are preconfigured to collect
|
||||||
parameters. See the standalone benchmarking tab for more information.
|
offline throughput and online serving performance data, you can
|
||||||
|
also change the benchmarking parameters. See the standalone
|
||||||
|
benchmarking tab for more information.
|
||||||
|
|
||||||
{% if model.tunableop %}
|
{% if model.tunableop %}
|
||||||
|
|
||||||
@@ -224,14 +221,12 @@ system's configuration.
|
|||||||
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||||
operators to find the fastest one for your hardware.
|
operators to find the fastest one for your hardware.
|
||||||
|
|
||||||
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
|
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
|
||||||
(see
|
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
|
||||||
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
|
the ``--tunableop on`` argument in your run.
|
||||||
To enable it, include the ``--tunableop on`` argument in your
|
|
||||||
run.
|
|
||||||
|
|
||||||
Enabling TunableOp triggers a two-pass run -- a warm-up followed
|
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
|
||||||
by the performance-collection run.
|
performance-collection run.
|
||||||
|
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
@@ -269,6 +264,13 @@ system's configuration.
|
|||||||
|
|
||||||
3. To start the benchmark, use the following command with the appropriate options.
|
3. To start the benchmark, use the following command with the appropriate options.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
./run.sh \
|
||||||
|
--config $CONFIG_CSV \
|
||||||
|
--model_repo {{ model.model_repo }} \
|
||||||
|
<overrides>
|
||||||
|
|
||||||
.. dropdown:: Benchmark options
|
.. dropdown:: Benchmark options
|
||||||
:open:
|
:open:
|
||||||
|
|
||||||
@@ -280,42 +282,40 @@ system's configuration.
|
|||||||
- Options
|
- Options
|
||||||
- Description
|
- Description
|
||||||
|
|
||||||
* - ``$test_option``
|
* - ``--config``
|
||||||
- latency
|
- ``configs/default.csv``
|
||||||
- Measure decoding token latency
|
- Run configs from the CSV for the chosen model repo and benchmark.
|
||||||
|
|
||||||
* -
|
* -
|
||||||
- throughput
|
- ``configs/extended.csv``
|
||||||
- Measure token generation throughput
|
-
|
||||||
|
|
||||||
* -
|
* -
|
||||||
- all
|
- ``configs/performance.csv``
|
||||||
- Measure both throughput and latency
|
-
|
||||||
|
|
||||||
* - ``$num_gpu``
|
* - ``--benchmark``
|
||||||
- 1 or 8
|
- ``throughput``
|
||||||
- Number of GPUs
|
- Measure offline end-to-end throughput.
|
||||||
|
|
||||||
* - ``$datatype``
|
* -
|
||||||
- ``float16`` or ``float8``
|
- ``serving``
|
||||||
- Data type
|
- Measure online serving performance.
|
||||||
|
|
||||||
|
* -
|
||||||
|
- ``all``
|
||||||
|
- Measure both throughput and serving.
|
||||||
|
|
||||||
|
* - ``<overrides>``
|
||||||
|
- See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
|
||||||
|
- Additional overrides to the config CSV.
|
||||||
|
|
||||||
The input sequence length, output sequence length, and tensor parallel (TP) are
|
The input sequence length, output sequence length, and tensor parallel (TP) are
|
||||||
already configured. You don't need to specify them with this script.
|
already configured. You don't need to specify them with this script.
|
||||||
|
|
||||||
Command:
|
|
||||||
|
|
||||||
.. code-block::
|
|
||||||
|
|
||||||
./vllm_benchmark_report.sh \
|
|
||||||
-s $test_option \
|
|
||||||
-m {{model.model_repo}} \
|
|
||||||
-g $num_gpu \
|
|
||||||
-d {{model.precision}}
|
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
For best performance, it's recommend to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
|
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
|
||||||
|
|
||||||
If you encounter the following error, pass your access-authorized Hugging
|
If you encounter the following error, pass your access-authorized Hugging
|
||||||
Face token to the gated models.
|
Face token to the gated models.
|
||||||
@@ -331,33 +331,33 @@ system's configuration.
|
|||||||
|
|
||||||
Here are some examples of running the benchmark with various options:
|
Here are some examples of running the benchmark with various options:
|
||||||
|
|
||||||
* Latency benchmark
|
|
||||||
|
|
||||||
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
|
||||||
|
|
||||||
.. code-block::
|
|
||||||
|
|
||||||
./vllm_benchmark_report.sh \
|
|
||||||
-s latency \
|
|
||||||
-m {{model.model_repo}} \
|
|
||||||
-g 8 \
|
|
||||||
-d {{model.precision}}
|
|
||||||
|
|
||||||
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
|
|
||||||
|
|
||||||
* Throughput benchmark
|
* Throughput benchmark
|
||||||
|
|
||||||
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
./vllm_benchmark_report.sh \
|
export MAD_MODEL_NAME={{ model.mad_tag }}
|
||||||
-s throughput \
|
./run.sh \
|
||||||
-m {{model.model_repo}} \
|
--config configs/default.csv \
|
||||||
-g 8 \
|
--model_repo {{model.model_repo}} \
|
||||||
-d {{model.precision}}
|
--benchmark throughput
|
||||||
|
|
||||||
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
|
Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
|
||||||
|
|
||||||
|
* Serving benchmark
|
||||||
|
|
||||||
|
Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
export MAD_MODEL_NAME={{ model.mad_tag }}
|
||||||
|
./run.sh \
|
||||||
|
--config configs/default.csv \
|
||||||
|
--model_repo {{model.model_repo}} \
|
||||||
|
--benchmark serving
|
||||||
|
|
||||||
|
Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
|
||||||
|
|
||||||
.. raw:: html
|
.. raw:: html
|
||||||
|
|
||||||
@@ -400,7 +400,7 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
|||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
cd vllm
|
cd vllm
|
||||||
git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
|
git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
|
||||||
|
|
||||||
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
|
||||||
|
|
||||||
@@ -408,11 +408,6 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
|
|||||||
|
|
||||||
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
|
docker build -f docker/Dockerfile.rocm -t vllm-rocm .
|
||||||
|
|
||||||
Known issues and workarounds
|
|
||||||
============================
|
|
||||||
|
|
||||||
AITER does not support FP8 KV cache yet.
|
|
||||||
|
|
||||||
Further reading
|
Further reading
|
||||||
===============
|
===============
|
||||||
|
|
||||||
|
|||||||
@@ -1,14 +1,14 @@
|
|||||||
.. meta::
|
.. meta::
|
||||||
:description: How to install ROCm and popular machine learning frameworks.
|
:description: How to install ROCm and popular deep learning frameworks.
|
||||||
:keywords: ROCm, AI, LLM, train, fine-tune, FSDP, DeepSpeed, LLaMA, tutorial
|
:keywords: ROCm, AI, LLM, train, fine-tune, FSDP, DeepSpeed, LLaMA, tutorial
|
||||||
|
|
||||||
.. _rocm-for-ai-install:
|
.. _rocm-for-ai-install:
|
||||||
|
|
||||||
***********************************************
|
********************************************
|
||||||
Installing ROCm and machine learning frameworks
|
Installing ROCm and deep learning frameworks
|
||||||
***********************************************
|
********************************************
|
||||||
|
|
||||||
Before getting started, install ROCm and supported machine learning frameworks.
|
Before getting started, install ROCm and supported deep learning frameworks.
|
||||||
|
|
||||||
.. grid:: 1
|
.. grid:: 1
|
||||||
|
|
||||||
@@ -43,29 +43,16 @@ distribution's package manager. See the following documentation resources to get
|
|||||||
If you encounter any issues during installation, refer to the
|
If you encounter any issues during installation, refer to the
|
||||||
:doc:`Installation troubleshooting <rocm-install-on-linux:reference/install-faq>` guide.
|
:doc:`Installation troubleshooting <rocm-install-on-linux:reference/install-faq>` guide.
|
||||||
|
|
||||||
Machine learning frameworks
|
Deep learning frameworks
|
||||||
===========================
|
========================
|
||||||
|
|
||||||
ROCm supports popular machine learning frameworks and libraries including `PyTorch
|
ROCm supports deep learning frameworks and libraries including `PyTorch
|
||||||
<https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package>`_, `TensorFlow
|
<https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package>`_, `TensorFlow
|
||||||
<https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and `DeepSpeed
|
<https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and more.
|
||||||
<https://cloudblogs.microsoft.com/opensource/2022/03/21/supporting-efficient-large-model-training-on-amd-instinct-gpus-with-deepspeed/>`_.
|
|
||||||
|
|
||||||
Review the framework installation documentation. For ease-of-use, it's recommended to use official ROCm prebuilt Docker
|
Review the :doc:`framework installation documentation <../deep-learning-rocm>`. For ease-of-use, it's recommended to use official ROCm prebuilt Docker
|
||||||
images with the framework pre-installed.
|
images with the framework pre-installed.
|
||||||
|
|
||||||
* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
|
||||||
|
|
||||||
* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
|
||||||
|
|
||||||
* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
|
||||||
|
|
||||||
* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
|
|
||||||
|
|
||||||
* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
|
||||||
|
|
||||||
* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
|
|
||||||
|
|
||||||
Next steps
|
Next steps
|
||||||
==========
|
==========
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
.. meta::
|
.. meta::
|
||||||
:description: How to train a model using Megatron-LM for ROCm.
|
:description: How to train a model using Megatron-LM for ROCm.
|
||||||
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||||
@@ -6,6 +8,14 @@
|
|||||||
Training a model with Megatron-LM for ROCm
|
Training a model with Megatron-LM for ROCm
|
||||||
******************************************
|
******************************************
|
||||||
|
|
||||||
|
.. caution::
|
||||||
|
|
||||||
|
The ROCm Megatron-LM framework now has limited support in this Docker
|
||||||
|
environment, which now focuses on Primus with Megatron-Core. See :doc:`primus-megatron`.
|
||||||
|
|
||||||
|
To learn how to migrate your existing workloads to Primus with Megatron-Core,
|
||||||
|
see :doc:`previous-versions/megatron-lm-primus-migration-guide`.
|
||||||
|
|
||||||
The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
|
The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
|
||||||
a specialized fork of the robust Megatron-LM, designed to enable efficient
|
a specialized fork of the robust Megatron-LM, designed to enable efficient
|
||||||
training of large-scale language models on AMD GPUs. By leveraging AMD
|
training of large-scale language models on AMD GPUs. By leveraging AMD
|
||||||
@@ -20,13 +30,17 @@ essential components, including PyTorch, ROCm libraries, and Megatron-LM
|
|||||||
utilities. It contains the following software components to accelerate training
|
utilities. It contains the following software components to accelerate training
|
||||||
workloads:
|
workloads:
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
|
||||||
|
Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.
|
||||||
|
|
||||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
||||||
|
|
||||||
{% set dockers = data.dockers %}
|
{% set dockers = data.dockers %}
|
||||||
{% if dockers|length > 1 %}
|
|
||||||
.. tab-set::
|
.. tab-set::
|
||||||
|
|
||||||
{% for docker in data.dockers %}
|
{% for docker in dockers %}
|
||||||
.. tab-item:: ``{{ docker.pull_tag }}``
|
.. tab-item:: ``{{ docker.pull_tag }}``
|
||||||
:sync: {{ docker.pull_tag }}
|
:sync: {{ docker.pull_tag }}
|
||||||
|
|
||||||
@@ -42,28 +56,14 @@ workloads:
|
|||||||
|
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% elif dockers|length == 1 %}
|
|
||||||
.. list-table::
|
|
||||||
:header-rows: 1
|
|
||||||
|
|
||||||
* - Software component
|
|
||||||
- Version
|
|
||||||
|
|
||||||
{% for component_name, component_version in docker.components %}
|
|
||||||
* - {{ component_name }}
|
|
||||||
- {{ component_version }}
|
|
||||||
|
|
||||||
{% endfor %}
|
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
.. _amd-megatron-lm-model-support:
|
.. _amd-megatron-lm-model-support:
|
||||||
|
|
||||||
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
|
||||||
|
|
||||||
Supported models
|
Supported models
|
||||||
================
|
================
|
||||||
|
|
||||||
The following models are supported for training performance benchmarking with Megatron-LM and ROCm.
|
The following models are supported for training performance benchmarking with Megatron-LM and ROCm
|
||||||
|
on AMD Instinct MI300X series accelerators.
|
||||||
Some instructions, commands, and training recommendations in this documentation might
|
Some instructions, commands, and training recommendations in this documentation might
|
||||||
vary by model -- select one to get started.
|
vary by model -- select one to get started.
|
||||||
|
|
||||||
@@ -177,7 +177,7 @@ Download the Docker image
|
|||||||
{% if dockers|length > 1 %}
|
{% if dockers|length > 1 %}
|
||||||
.. tab-set::
|
.. tab-set::
|
||||||
|
|
||||||
{% for docker in data.dockers %}
|
{% for docker in dockers %}
|
||||||
.. tab-item:: {{ docker.doc_name }}
|
.. tab-item:: {{ docker.doc_name }}
|
||||||
:sync: {{ docker.pull_tag }}
|
:sync: {{ docker.pull_tag }}
|
||||||
|
|
||||||
@@ -227,10 +227,17 @@ Download the Docker image
|
|||||||
docker start megatron_training_env
|
docker start megatron_training_env
|
||||||
docker exec -it megatron_training_env bash
|
docker exec -it megatron_training_env bash
|
||||||
|
|
||||||
The Docker container includes a pre-installed, verified version of the ROCm
|
4. **Megatron-LM backward compatibility setup** -- this Docker image is primarily intended for use with Primus, but it maintains limited backward compatibility with Megatron-LM.
|
||||||
Megatron-LM development branch
|
To roll back to using Megatron-LM, follow these steps:
|
||||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__, including necessary
|
|
||||||
training scripts.
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd /workspace/Megatron-LM/
|
||||||
|
pip uninstall megatron-core
|
||||||
|
pip install -e .
|
||||||
|
|
||||||
|
The Docker container hosts
|
||||||
|
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__ at verified commit ``e8e9edc``.
|
||||||
|
|
||||||
.. _amd-megatron-lm-environment-setup:
|
.. _amd-megatron-lm-environment-setup:
|
||||||
|
|
||||||
|
|||||||
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
|
|||||||
- Components
|
- Components
|
||||||
- Resources
|
- Resources
|
||||||
|
|
||||||
* - v25.6 (latest)
|
* - v25.7 (latest)
|
||||||
|
-
|
||||||
|
* ROCm
|
||||||
|
* PyTorch
|
||||||
|
-
|
||||||
|
* :doc:`Documentation <../megatron-lm>`
|
||||||
|
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a>`__
|
||||||
|
|
||||||
|
* - v25.6
|
||||||
-
|
-
|
||||||
* ROCm 6.4.1
|
* ROCm 6.4.1
|
||||||
* PyTorch 2.8.0a0+git7d205b2
|
* PyTorch 2.8.0a0+git7d205b2
|
||||||
-
|
-
|
||||||
* :doc:`Documentation <../megatron-lm>`
|
* :doc:`Documentation <megatron-lm-v25.6>`
|
||||||
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
|
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
|
||||||
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__
|
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,175 @@
|
|||||||
|
:orphan:
|
||||||
|
|
||||||
|
**********************************************************************
|
||||||
|
Migrating workloads to Primus (Megatron-Core backend) from Megatron-LM
|
||||||
|
**********************************************************************
|
||||||
|
|
||||||
|
Primus supports Megatron-Core as a backend optimization library,
|
||||||
|
replacing ROCm Megatron-LM. This document outlines the steps to migrate
|
||||||
|
workloads from ROCm Megatron-LM to Primus with the Megatron-Core backend.
|
||||||
|
|
||||||
|
Model architecture
|
||||||
|
==================
|
||||||
|
|
||||||
|
ROCm Megatron-LM defines model architecture parameters in the training scripts;
|
||||||
|
for example, the Llama 3 8B model parameters are defined in
|
||||||
|
`examples/llama/train_llama3.sh <https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama3.sh#L117>`__
|
||||||
|
as shown below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
HIDDEN_SIZE=4096
|
||||||
|
FFN_HIDDEN_SIZE=14336
|
||||||
|
NUM_LAYERS=32
|
||||||
|
NUM_HEADS=32
|
||||||
|
NUM_KV_HEADS=8
|
||||||
|
|
||||||
|
Primus defines the model architecture through model YAML configuration files
|
||||||
|
inside the ``primus/configs/models/megatron/`` directory of the repository. For example, the Llama 3 8B
|
||||||
|
model architecture parameters are defined in
|
||||||
|
`primus/configs/models/megatron/llama3_8B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/llama3_8B.yaml>`__
|
||||||
|
as shown below:
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
bases:
|
||||||
|
- llama3_base.yaml
|
||||||
|
|
||||||
|
tokenizer_type: Llama3Tokenizer
|
||||||
|
tokenizer_model: meta-llama/Llama-3.1-8B
|
||||||
|
|
||||||
|
ffn_hidden_size: 14336
|
||||||
|
hidden_size: 4096
|
||||||
|
num_attention_heads: 32
|
||||||
|
num_layers: 32
|
||||||
|
num_query_groups: 8
|
||||||
|
|
||||||
|
Primus' model config files follow a hierarchical design, meaning that new model
|
||||||
|
config YAMLs can inherit existing model config files by importing them as
|
||||||
|
bases. For example,
|
||||||
|
`llama3.1_8B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
|
||||||
|
uses ``llama3_8B.yaml`` as a base config and overrides few parameters, as shown below.
|
||||||
|
In this example, ``llama3.1_8B`` overrides the ``max_position_embeddings`` value:
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
bases:
|
||||||
|
- llama3_8B.yaml
|
||||||
|
|
||||||
|
tokenizer_type: Llama3Tokenizer
|
||||||
|
tokenizer_model: meta-llama/Llama-3.1-8B
|
||||||
|
|
||||||
|
max_position_embeddings: 131072
|
||||||
|
|
||||||
|
.. tip::
|
||||||
|
|
||||||
|
Primus provides ``llama_base.yaml`` as the base configuration, which can be
|
||||||
|
used as a base for additional model architectures. For example,
|
||||||
|
`mixtral_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/mixtral_base.yaml>`__
|
||||||
|
and
|
||||||
|
`deepseek_v3_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/deepseek_v3_base.yaml>`__
|
||||||
|
define ``llama_base.yaml`` as their base.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
# Example mixtral_base.yaml:
|
||||||
|
|
||||||
|
bases:
|
||||||
|
- llama_base.yaml
|
||||||
|
|
||||||
|
init_method_std: 0.01
|
||||||
|
rotary_base: 1000000
|
||||||
|
qk_layernorm: false
|
||||||
|
|
||||||
|
group_query_attention: true
|
||||||
|
num_query_groups: 8
|
||||||
|
|
||||||
|
# moe parameters
|
||||||
|
num_experts: 8
|
||||||
|
moe_router_topk: 2
|
||||||
|
moe_router_load_balancing_type: aux_loss
|
||||||
|
moe_aux_loss_coeff: 1e-2
|
||||||
|
moe_grouped_gemm: true
|
||||||
|
moe_token_dispatcher_type: alltoall
|
||||||
|
|
||||||
|
It is recommended to add a new ``${MODEL_NAME}_base.yaml`` for each new
|
||||||
|
category of model and define new models on top of it. For example, to add
|
||||||
|
Qwen2.5 models in Primus, we define
|
||||||
|
`qwen2.5_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_base.yaml>`__
|
||||||
|
and build
|
||||||
|
`qwen2.5_7B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_7B.yaml>`__
|
||||||
|
and
|
||||||
|
`qwen2.5_72B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_72B.yaml>`__
|
||||||
|
using ``qwen2.5_base.yaml`` as the base config.
|
||||||
|
|
||||||
|
Training parameters
|
||||||
|
===================
|
||||||
|
|
||||||
|
ROCm Megatron-LM also defines the training parameters, like batch size,
|
||||||
|
tensor parallelism, precision, and so on, in the training scripts. For example,
|
||||||
|
the Llama 3 8B training parameters are defined in
|
||||||
|
`examples/llama/train_llama3.sh <https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama3.sh>`__
|
||||||
|
as shown below:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
TP="${TP:-8}"
|
||||||
|
PP="${PP:-1}"
|
||||||
|
CP="${CP:-1}"
|
||||||
|
MBS="${MBS:-1}"
|
||||||
|
BS="${BS:-8}"
|
||||||
|
|
||||||
|
Primus defines the training parameters in top-level YAML files -- see
|
||||||
|
`examples/megatron/configs/
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
|
||||||
|
For example, the `llama3.1_8B-pretrain.yaml
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
|
||||||
|
configuration imports the ``llama3.1_8B.yaml`` model architecture file. Users can then override
|
||||||
|
the default training parameters in ``llama3.1_8B-pretrain.yaml``.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
# model to run
|
||||||
|
model: llama3.1_8B.yaml # Model architecture yaml
|
||||||
|
overrides:
|
||||||
|
# log
|
||||||
|
# disable_wandb: false
|
||||||
|
# disable_tensorboard: false
|
||||||
|
stderr_sink_level: DEBUG
|
||||||
|
|
||||||
|
log_avg_skip_iterations: 2
|
||||||
|
log_avg_reset_interval: 50
|
||||||
|
|
||||||
|
train_iters: 50
|
||||||
|
micro_batch_size: 2
|
||||||
|
global_batch_size: 128
|
||||||
|
|
||||||
|
seq_length: 8192
|
||||||
|
max_position_embeddings: 8192
|
||||||
|
|
||||||
|
lr: 1.0e-5
|
||||||
|
min_lr: 0.0
|
||||||
|
lr_warmup_iters: 2
|
||||||
|
lr_decay_iters: null
|
||||||
|
lr_decay_style: cosine
|
||||||
|
weight_decay: 0.1
|
||||||
|
adam_beta1: 0.9
|
||||||
|
adam_beta2: 0.95
|
||||||
|
eod_mask_loss: true
|
||||||
|
init_method_std: 0.008
|
||||||
|
norm_epsilon: 1.0e-6
|
||||||
|
|
||||||
|
Backward compatibility with Megatron-LM
|
||||||
|
=======================================
|
||||||
|
|
||||||
|
The Dockerized environment used for Primus maintains limited backward compatibility with
|
||||||
|
Megatron-LM. To roll back to using Megatron-LM, follow these steps.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
cd /workspace/Megatron-LM/
|
||||||
|
pip uninstall megatron-core
|
||||||
|
pip install -e .
|
||||||
|
|
||||||
|
Once Megatron-LM is installed, follow :doc:`the documentation <../megatron-lm>` to run workloads as
|
||||||
|
usual.
|
||||||
File diff suppressed because it is too large
@@ -0,0 +1,602 @@
|
|||||||
|
.. meta::
|
||||||
|
:description: How to train a model using Megatron-LM for ROCm.
|
||||||
|
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||||
|
|
||||||
|
**********************************************
|
||||||
|
Training a model with Primus and Megatron-Core
|
||||||
|
**********************************************
|
||||||
|
|
||||||
|
`Primus <https://github.com/AMD-AIG-AIMA/Primus>`__ is a unified and flexible
|
||||||
|
LLM training framework designed to streamline LLM
|
||||||
|
training on AMD Instinct accelerators using a modular, reproducible configuration paradigm.
|
||||||
|
Primus is backend-agnostic and supports multiple training engines -- including Megatron-Core.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Primus with the Megatron-Core backend is intended to replace ROCm
|
||||||
|
Megatron-LM in this Dockerized training environment. To learn how to migrate
|
||||||
|
workloads from Megatron-LM to Primus with Megatron-Core, see
|
||||||
|
:doc:`previous-versions/megatron-lm-primus-migration-guide`.
|
||||||
|
|
||||||
|
For ease of use, AMD provides a ready-to-use Docker image for MI300 series accelerators
|
||||||
|
containing essential components for Primus and Megatron-Core.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
|
||||||
|
Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set dockers = data.dockers %}
|
||||||
|
{% set docker = dockers[0] %}
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Software component
|
||||||
|
- Version
|
||||||
|
|
||||||
|
{% for component_name, component_version in docker.components.items() %}
|
||||||
|
* - {{ component_name }}
|
||||||
|
- {{ component_version }}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-model-support:
|
||||||
|
|
||||||
|
Supported models
|
||||||
|
================
|
||||||
|
|
||||||
|
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
||||||
|
Some instructions, commands, and training examples in this documentation might
|
||||||
|
vary by model -- select one to get started.
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set model_groups = data.model_groups %}
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row mt-1">
|
||||||
|
<div class="col-2 me-2 model-param-head">Model variant</div>
|
||||||
|
<div class="row col-10">
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% set models = model_group.models %}
|
||||||
|
{% for model in models %}
|
||||||
|
{% if models|length % 3 == 0 %}
|
||||||
|
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Some models, such as Llama, require an external license agreement through
|
||||||
|
a third party (for example, Meta).
|
||||||
|
|
||||||
|
System validation
|
||||||
|
=================
|
||||||
|
|
||||||
|
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||||
|
correctly and performing optimally.
|
||||||
|
|
||||||
|
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||||
|
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||||
|
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||||
|
before starting training.
|
||||||
|
|
||||||
|
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||||
|
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||||
|
system's configuration.
|
||||||
|
|
||||||
|
.. _mi300x-amd-primus-megatron-lm-training:
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set dockers = data.dockers %}
|
||||||
|
{% set docker = dockers[0] %}
|
||||||
|
|
||||||
|
Environment setup
|
||||||
|
=================
|
||||||
|
|
||||||
|
Use the following instructions to set up the environment, configure the script to train models, and
|
||||||
|
reproduce the benchmark results on MI300X series accelerators with the ``{{ docker.pull_tag }}`` image.
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-requirements:
|
||||||
|
|
||||||
|
Download the Docker image
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
1. Use the following command to pull the Docker image from Docker Hub.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker pull {{ docker.pull_tag }}
|
||||||
|
|
||||||
|
2. Launch the Docker container.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker run -it \
|
||||||
|
--device /dev/dri \
|
||||||
|
--device /dev/kfd \
|
||||||
|
--device /dev/infiniband \
|
||||||
|
--network host --ipc host \
|
||||||
|
--group-add video \
|
||||||
|
--cap-add SYS_PTRACE \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--privileged \
|
||||||
|
-v $HOME:$HOME \
|
||||||
|
--shm-size 128G \
|
||||||
|
--name primus_training_env \
|
||||||
|
{{ docker.pull_tag }}
|
||||||
|
|
||||||
|
3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
docker start primus_training_env
|
||||||
|
docker exec -it primus_training_env bash
|
||||||
|
|
||||||
|
The Docker container hosts the verified release tag ``v0.1.0-rc1`` of the `Primus
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1>`__ repository.
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-environment-setup:
|
||||||
|
|
||||||
|
Configuration
|
||||||
|
=============
|
||||||
|
|
||||||
|
Primus defines a training configuration in YAML for each model in
|
||||||
|
`examples/megatron/configs <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
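For example, you can list the available per-model configuration files from inside the container (the ``/workspace/Primus`` path matches the location used in the training steps below):

.. code-block:: shell

   ls /workspace/Primus/examples/megatron/configs/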
|
||||||
|
|
||||||
|
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||||
|
|
||||||
|
{% set model_groups = data.model_groups %}
|
||||||
|
{% for model_group in model_groups %}
|
||||||
|
{% for model in model_group.models %}
|
||||||
|
.. container:: model-doc {{ model.mad_tag }}
|
||||||
|
|
||||||
|
To update training parameters for {{ model.model }}, you can update ``examples/megatron/configs/{{ model.config_name }}``.
|
||||||
|
Note that training configuration YAML files for other models follow this naming convention.
|
||||||
|
|
||||||
|
{% endfor %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
See :ref:`Key options <amd-primus-megatron-lm-benchmark-test-vars>` for more information on configuration options.
|
||||||
|
|
||||||
|
Dataset options
|
||||||
|
---------------
|
||||||
|
|
||||||
|
You can use either mock data or real data for training.
|
||||||
|
|
||||||
|
* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default
|
||||||
|
value is ``true`` for enabled.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
mock_data: true
|
||||||
|
|
||||||
|
* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset.
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
mock_data: false
|
||||||
|
train_data_path: /path/to/your/dataset
|
||||||
|
|
||||||
|
Ensure that the files are accessible inside the Docker container.
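For example, if your dataset lives under ``/data/datasets`` on the host (a hypothetical path -- adjust it to your setup), add a corresponding bind mount to the ``docker run`` command from the setup step and verify the path from inside the container:

.. code-block:: shell

   # Additional flag for the docker run command shown earlier:
   #   -v /data/datasets:/data/datasets
   # Inside the container, confirm the path referenced by train_data_path exists:
   ls /data/datasets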
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-tokenizer:
|
||||||
|
|
||||||
|
Tokenizer
|
||||||
|
---------
|
||||||
|
|
||||||
|
In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama
|
||||||
|
3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
|
||||||
|
``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model
|
||||||
|
<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
|
||||||
|
definition. As such, you need to set the ``HF_TOKEN`` environment variable with the
|
||||||
|
right permissions to access the tokenizer for each model.
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
# Export your HF_TOKEN in the workspace
|
||||||
|
export HF_TOKEN=<your_hftoken>
|
||||||
|
|
||||||
|
.. _amd-primus-megatron-lm-run-training:
|
||||||
|
|
||||||
|
Run training
|
||||||
|
============
|
||||||
|
|
||||||
|
Use the following example commands to set up the environment, configure
|
||||||
|
:ref:`key options <amd-primus-megatron-lm-benchmark-test-vars>`, and run training on
|
||||||
|
MI300X series accelerators with the Primus training environment.
|
||||||
|
|
||||||
|
Single node training
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
To run training on a single node, navigate to ``/workspace/Primus`` and use the following setup commands:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
pip install -r requirements.txt
|
||||||
|
export HSA_NO_SCRATCH_RECLAIM=1
|
||||||
|
export NVTE_CK_USES_BWD_V3=1
|
||||||
|
|
||||||
|
Once setup is complete, run the appropriate training command.
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||||
|
|
||||||
|
To run pre-training for Llama 3.3 70B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--micro_batch_size 2 \
|
||||||
|
--global_batch_size 16 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||||
|
|
||||||
|
To run pre-training for Llama 3.1 8B FP8, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50 \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
For Llama 3.1 8B BF16, use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||||
|
|
||||||
|
To run pre-training for Llama 3.1 70B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
To run training on a single node for Llama 3.1 70B FP8 with a 40-layer proxy, use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50 \
|
||||||
|
--num_layers 40 \
|
||||||
|
--fp8 hybrid \
|
||||||
|
--no_fp8_weight_transpose_cache true
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Use two or more nodes to run the *full* Llama 70B model with FP8 precision.
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
|
||||||
|
|
||||||
|
To run pre-training for Llama 2 7B FP8, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh \
|
||||||
|
--train_iters 50 \
|
||||||
|
--fp8 hybrid
|
||||||
|
|
||||||
|
To run pre-training for Llama 2 7B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
|
||||||
|
|
||||||
|
To run pre-training for Llama 2 70B BF16, run:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||||
|
bash ./examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy
|
||||||
|
|
||||||
|
To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with a 3-layer proxy,
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh \
|
||||||
|
--num_layers 3 \
|
||||||
|
--moe_layer_freq 1 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||||
|
|
||||||
|
To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh \
|
||||||
|
--global_batch_size 256 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||||
|
|
||||||
|
To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh --train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||||
|
|
||||||
|
To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with a 4-layer proxy,
|
||||||
|
use the following command:
|
||||||
|
|
||||||
|
.. code-block:: shell
|
||||||
|
|
||||||
|
EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
|
||||||
|
bash examples/run_pretrain.sh \
|
||||||
|
--num_layers 4 \
|
||||||
|
--pipeline_model_parallel_size 1 \
|
||||||
|
--micro_batch_size 1 \
|
||||||
|
--global_batch_size 16 \
|
||||||
|
--train_iters 50
|
||||||
|
|
||||||
|
.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b

   To run training on a single node for Qwen 2.5 7B BF16, use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

   For FP8, use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
      bash examples/run_pretrain.sh \
        --train_iters 50 \
        --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

   To run the training on a single node for Qwen 2.5 72B BF16, use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

Multi-node training examples
----------------------------

To run training on multiple nodes, use the
`run_slurm_pretrain.sh <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/run_slurm_pretrain.sh>`__
script to launch the multi-node workload. Use the following steps to set up your environment:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

   {% set dockers = data.dockers %}
   {% set docker = dockers[0] %}

   .. code-block:: shell

      cd /workspace/Primus/
      export DOCKER_IMAGE={{ docker.pull_tag }}
      export HF_TOKEN=<your_HF_token>
      export HSA_NO_SCRATCH_RECLAIM=1
      export NVTE_CK_USES_BWD_V3=1
      export NCCL_IB_HCA=<your_NCCL_IB_HCA>                 # RDMA interfaces to use for communication
      export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME>   # your network interface
      export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME>   # your network interface
      export NCCL_IB_GID_INDEX=3                            # InfiniBand GID index for NCCL communication; 3 is the default for RoCE

.. note::

   * Make sure the correct network drivers are installed on the nodes. If you're running inside Docker, either install the drivers inside the container or pass them through from the host when creating the container.
   * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus tries to auto-detect them. However, because NICs can vary across clusters, it's recommended to explicitly export the NCCL parameters for your cluster.
   * To find your network interface, you can use ``ip a``.
   * To find RDMA interfaces, you can use ``ibv_devices`` to list all the RDMA/IB devices.

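As a reference, the following sketch shows how these values might be discovered and exported on a node. The interface and device names used here (``ens51np0``, ``mlx5_0``, ``mlx5_1``) are placeholders only; substitute the names reported by your own cluster.

.. code-block:: shell

   # List network interfaces and pick the one that carries node-to-node traffic.
   ip a

   # List the RDMA/IB devices visible on this node.
   ibv_devices

   # Example exports; replace the values with the names found above.
   export NCCL_SOCKET_IFNAME=ens51np0
   export GLOO_SOCKET_IFNAME=ens51np0
   export NCCL_IB_HCA=mlx5_0,mlx5_1
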
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b

   To train Llama 3.3 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 4 \
        --global_batch_size 256 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 3.3 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 1 \
        --global_batch_size 256 \
        --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b

   To train Llama 3.1 8B FP8 on 8 nodes, run:

   .. code-block:: shell

      # Adjust the training parameters for the node count. For example, set global_batch_size to 8 * single-node batch size for 8 nodes.
      NNODES=8 EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
        --global_batch_size 1024 \
        --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b

   To train Llama 3.1 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 4 \
        --global_batch_size 256 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 3.1 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 1 \
        --global_batch_size 256 \
        --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b

   To train Llama 2 7B FP8 on 8 nodes, run:

   .. code-block:: shell

      # Adjust the training parameters for the node count. For example, set global_batch_size to 8 * single-node batch size for 8 nodes.
      NNODES=8 EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
        --global_batch_size 2048 \
        --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b

   To train Llama 2 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 10 \
        --global_batch_size 640 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

   To train Llama 2 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
        --micro_batch_size 2 \
        --global_batch_size 1536 \
        --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b

   To train Mixtral 8x7B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 2 \
        --global_batch_size 256

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

   To train Qwen 2.5 72B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
        --micro_batch_size 8 \
        --global_batch_size 512 \
        --recompute_num_layers 80 \
        --no_fp8_weight_transpose_cache true \
        --fp8 hybrid

.. _amd-primus-megatron-lm-benchmark-test-vars:

Key options
-----------

The following are key options to take note of when configuring a training run:

fp8
   ``hybrid`` enables FP8 GEMMs.

use_torch_fsdp2
   ``use_torch_fsdp2: 1`` enables Torch FSDP v2. If FSDP is enabled,
   set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``
   (see the sketch after this list).

profile
   To enable PyTorch profiling, set these parameters:

   .. code-block:: yaml

      profile: true
      use_pytorch_profiler: true
      profile_step_end: 7
      profile_step_start: 6

train_iters
   The total number of training iterations (default: 50).

mock_data
   Use mock (synthetic) data instead of a real dataset. ``true`` by default.

micro_batch_size
   Micro batch size.

global_batch_size
   Global batch size.

recompute_granularity
   For activation checkpointing.

num_layers
   For using a reduced number of layers, as with proxy models.

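The snippet below is a minimal sketch of how several of these options could be set together in an experiment YAML, assuming they are accepted as top-level keys in the same way as the profiling parameters shown above. Verify the exact key names against the config files under ``examples/megatron/configs``.

.. code-block:: yaml

   # Sketch only: representative settings for a short benchmarking run.
   train_iters: 50            # total number of iterations
   mock_data: true            # use synthetic data
   micro_batch_size: 2
   global_batch_size: 256
   fp8: hybrid                # enable FP8 GEMMs

   # If enabling Torch FSDP v2, disable the conflicting optimizer settings.
   use_torch_fsdp2: 1
   use_distributed_optimizer: false
   overlap_param_gather: false
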
Previous versions
=================

See :doc:`previous-versions/megatron-lm-history` to find documentation for previous releases
of the ``ROCm/megatron-lm`` Docker image.

This training environment now uses Primus with Megatron as the primary
configuration. Limited support for the legacy ROCm Megatron-LM is still
available. For instructions on using ROCm Megatron-LM, see the
:doc:`megatron-lm` document.

@@ -21,6 +21,8 @@ In this guide, you'll learn about:
 - Training a model

+- :doc:`With Primus (Megatron-LM backend) <benchmark-docker/primus-megatron>`
+
 - :doc:`With Megatron-LM <benchmark-docker/megatron-lm>`
 - :doc:`With PyTorch <benchmark-docker/pytorch-training>`

@@ -285,7 +285,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
 - Radeon AI PRO R9700
 - RDNA4
 - gfx1201
-- 16
+- 32
 - 64
 - 32 or 64
 - 128

@@ -27,6 +27,24 @@ subtrees:
 title: ROCm on Radeon GPUs
 - file: how-to/deep-learning-rocm.md
 title: Deep learning frameworks
+subtrees:
+- entries:
+- file: compatibility/ml-compatibility/pytorch-compatibility.rst
+title: PyTorch compatibility
+- file: compatibility/ml-compatibility/tensorflow-compatibility.rst
+title: TensorFlow compatibility
+- file: compatibility/ml-compatibility/jax-compatibility.rst
+title: JAX compatibility
+- file: compatibility/ml-compatibility/verl-compatibility.rst
+title: verl compatibility
+- file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
+title: Stanford Megatron-LM compatibility
+- file: compatibility/ml-compatibility/dgl-compatibility.rst
+title: DGL compatibility
+- file: compatibility/ml-compatibility/megablocks-compatibility.rst
+title: Megablocks compatibility
+- file: compatibility/ml-compatibility/taichi-compatibility.rst
+title: Taichi compatibility
 - file: how-to/build-rocm.rst
 title: Build ROCm from source

@@ -44,8 +62,8 @@ subtrees:
 title: Training
 subtrees:
 - entries:
-- file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
-title: Train a model with Megatron-LM
+- file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+title: Train a model with Primus and Megatron-Core
 - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
 title: Train a model with PyTorch
 - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst

@@ -1,4 +1,4 @@
-rocm-docs-core==1.20.1
+rocm-docs-core==1.22.0
 sphinx-reredirects
 sphinx-sitemap
 sphinxcontrib.datatemplates==0.11.0

@@ -23,7 +23,7 @@ beautifulsoup4==4.13.4
 # via pydata-sphinx-theme
 breathe==4.36.0
 # via rocm-docs-core
-certifi==2025.4.26
+certifi==2025.7.14
 # via requests
 cffi==1.17.1
 # via

@@ -35,18 +35,16 @@ click==8.2.1
 # via
 # jupyter-cache
 # sphinx-external-toc
-comm==0.2.2
+comm==0.2.3
 # via ipykernel
-cryptography==45.0.3
+cryptography==45.0.5
 # via pyjwt
-debugpy==1.8.14
+debugpy==1.8.15
 # via ipykernel
 decorator==5.2.1
 # via ipython
 defusedxml==0.7.1
 # via sphinxcontrib-datatemplates
-deprecated==1.2.18
-# via pygithub
 docutils==0.21.2
 # via
 # myst-parser

@@ -62,7 +60,7 @@ fastjsonschema==2.21.1
 # rocm-docs-core
 gitdb==4.0.12
 # via gitpython
-gitpython==3.1.44
+gitpython==3.1.45
 # via rocm-docs-core
 greenlet==3.2.3
 # via sqlalchemy

@@ -74,7 +72,7 @@ importlib-metadata==8.7.0
 # via
 # jupyter-cache
 # myst-nb
-ipykernel==6.29.5
+ipykernel==6.30.0
 # via myst-nb
 ipython==8.37.0
 # via

@@ -86,7 +84,7 @@ jinja2==3.1.6
 # via
 # myst-parser
 # sphinx
-jsonschema==4.24.0
+jsonschema==4.25.0
 # via nbformat
 jsonschema-specifications==2025.4.1
 # via jsonschema

@@ -116,7 +114,7 @@ mdit-py-plugins==0.4.2
 # via myst-parser
 mdurl==0.1.2
 # via markdown-it-py
-myst-nb==1.2.0
+myst-nb==1.3.0
 # via rocm-docs-core
 myst-parser==4.0.1
 # via myst-nb

@@ -134,7 +132,6 @@ nest-asyncio==1.6.0
 packaging==25.0
 # via
 # ipykernel
-# pydata-sphinx-theme
 # sphinx
 parso==0.8.4
 # via jedi

@@ -152,13 +149,13 @@ pure-eval==0.2.3
 # via stack-data
 pycparser==2.22
 # via cffi
-pydata-sphinx-theme==0.15.4
+pydata-sphinx-theme==0.16.1
 # via
 # rocm-docs-core
 # sphinx-book-theme
-pygithub==2.6.1
+pygithub==2.7.0
 # via rocm-docs-core
-pygments==2.19.1
+pygments==2.19.2
 # via
 # accessible-pygments
 # ipython

@@ -178,7 +175,7 @@ pyyaml==6.0.2
 # rocm-docs-core
 # sphinx-external-toc
 # sphinxcontrib-datatemplates
-pyzmq==26.4.0
+pyzmq==27.0.0
 # via
 # ipykernel
 # jupyter-client

@@ -190,9 +187,9 @@ requests==2.32.4
 # via
 # pygithub
 # sphinx
-rocm-docs-core==1.20.1
+rocm-docs-core==1.22.0
 # via -r requirements.in
-rpds-py==0.25.1
+rpds-py==0.26.0
 # via
 # jsonschema
 # referencing

@@ -220,7 +217,7 @@ sphinx==8.1.3
 # sphinx-reredirects
 # sphinxcontrib-datatemplates
 # sphinxcontrib-runcmd
-sphinx-book-theme==1.1.4
+sphinx-book-theme==1.1.3
 # via rocm-docs-core
 sphinx-copybutton==0.5.2
 # via rocm-docs-core

@@ -252,7 +249,7 @@ sphinxcontrib-runcmd==0.2.0
 # via sphinxcontrib-datatemplates
 sphinxcontrib-serializinghtml==2.0.0
 # via sphinx
-sqlalchemy==2.0.41
+sqlalchemy==2.0.42
 # via jupyter-cache
 stack-data==0.6.3
 # via ipython

@@ -266,7 +263,6 @@ tornado==6.5.1
 # jupyter-client
 traitlets==5.14.3
 # via
-# comm
 # ipykernel
 # ipython
 # jupyter-client

@@ -274,7 +270,7 @@ traitlets==5.14.3
 # matplotlib-inline
 # nbclient
 # nbformat
-typing-extensions==4.14.0
+typing-extensions==4.14.1
 # via
 # beautifulsoup4
 # exceptiongroup

@@ -290,7 +286,5 @@ urllib3==2.5.0
 # requests
 wcwidth==0.2.13
 # via prompt-toolkit
-wrapt==1.17.2
-# via deprecated
 zipp==3.23.0
 # via importlib-metadata