From 8647ebcf769dce5dc670a2362ef2e8f12dadf166 Mon Sep 17 00:00:00 2001
From: peterjunpark <peter.park@amd.com>
Date: Thu, 4 Dec 2025 09:08:00 -0500
Subject: [PATCH] Update training Docker docs for Primus 25.10 (#5737)

(cherry picked from commit fb644412d598fc1ee26977541205c471515b30a4)
---
 .../megatron-lm-benchmark-models.yaml         |   32 +-
 .../megatron-lm-v25.9-benchmark-models.yaml   |   53 +
 ...rimus-megatron-v25.9-benchmark-models.yaml |   65 +
 ...primus-pytorch-v25.9-benchmark-models.yaml |   39 +
 ...torch-training-v25.9-benchmark-models.yaml |  186 +++
 .../primus-megatron-benchmark-models.yaml     |   31 +-
 .../primus-pytorch-benchmark-models.yaml      |   63 +-
 .../pytorch-training-benchmark-models.yaml    |   53 +-
 .../training/benchmark-docker/megatron-lm.rst |  122 +-
 .../previous-versions/megatron-lm-history.rst |   15 +-
 .../previous-versions/megatron-lm-v25.9.rst   | 1044 +++++++++++++++++
 .../primus-megatron-v25.9.rst                 | 1019 ++++++++++++++++
 .../primus-pytorch-v25.9.rst                  |  574 +++++++++
 .../pytorch-training-history.rst              |   15 +-
 .../pytorch-training-v25.9.rst                |  667 +++++++++++
 .../benchmark-docker/primus-megatron.rst      |  304 +++--
 .../benchmark-docker/primus-pytorch.rst       |  332 ++----
 .../benchmark-docker/pytorch-training.rst     |  104 +-
 18 files changed, 4158 insertions(+), 560 deletions(-)
 create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.9-benchmark-models.yaml
 create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml
 create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
 create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
 create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.9.rst
 create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.9.rst
 create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.9.rst
 create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.9.rst

diff --git a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
index 1bf411207..8cb0fd12e 100644
--- a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
@@ -1,21 +1,17 @@
-dockers:
-  MI355X and MI350X:
-    pull_tag: rocm/megatron-lm:v25.9_gfx950
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
-    components: &docker_components
-      ROCm: 7.0.0
-      Primus: aab4234
-      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
-      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      Flash Attention: 2.8.3
-      hipBLASLt: 911283acd1
-      Triton: 3.4.0+rocm7.0.0.git56765e8c
-      RCCL: 2.26.6
-  MI325X and MI300X:
-    pull_tag: rocm/megatron-lm:v25.9_gfx942
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
-    components: *docker_components
+docker:
+  pull_tag: rocm/primus:v25.10
+  docker_hub_url: https://hub.docker.com/r/rocm/primus/tags?name=v25.10
+  components:
+    ROCm: 7.1.0
+    Primus: 0.3.0
+    Primus Turbo: 0.1.1
+    PyTorch: 2.10.0.dev20251112+rocm7.1
+    Python: "3.10"
+    Transformer Engine: 2.4.0.dev0+32e2d1d4
+    Flash Attention: 2.8.3
+    hipBLASLt: 1.2.0-09ab7153e2
+    Triton: 3.4.0
+    RCCL: 2.27.7
 model_groups:
   - group: Meta Llama
     tag: llama
diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.9-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.9-benchmark-models.yaml
new file mode 100644
index 000000000..1bf411207
--- /dev/null
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.9-benchmark-models.yaml
@@ -0,0 +1,53 @@
+dockers:
+  MI355X and MI350X:
+    pull_tag: rocm/megatron-lm:v25.9_gfx950
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
+    components: &docker_components
+      ROCm: 7.0.0
+      Primus: aab4234
+      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
+      Python: "3.10"
+      Transformer Engine: 2.2.0.dev0+54dd2bdc
+      Flash Attention: 2.8.3
+      hipBLASLt: 911283acd1
+      Triton: 3.4.0+rocm7.0.0.git56765e8c
+      RCCL: 2.26.6
+  MI325X and MI300X:
+    pull_tag: rocm/megatron-lm:v25.9_gfx942
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
+    components: *docker_components
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
+      - model: Llama 3.1 8B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
+      - model: Llama 3.1 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
+      - model: Llama 2 7B
+        mad_tag: pyt_megatron_lm_train_llama-2-7b
+      - model: Llama 2 70B
+        mad_tag: pyt_megatron_lm_train_llama-2-70b
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
+      - model: DeepSeek-V2-Lite
+        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
+      - model: Qwen 2.5 72B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml
new file mode 100644
index 000000000..386538cf1
--- /dev/null
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml
@@ -0,0 +1,65 @@
+dockers:
+  MI355X and MI350X:
+    pull_tag: rocm/primus:v25.9_gfx950
+    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
+    components: &docker_components
+      ROCm: 7.0.0
+      Primus: 0.3.0
+      Primus Turbo: 0.1.1
+      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
+      Python: "3.10"
+      Transformer Engine: 2.2.0.dev0+54dd2bdc
+      Flash Attention: 2.8.3
+      hipBLASLt: 911283acd1
+      Triton: 3.4.0+rocm7.0.0.git56765e8c
+      RCCL: 2.26.6
+  MI325X and MI300X:
+    pull_tag: rocm/primus:v25.9_gfx942
+    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
+    components: *docker_components
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
+        config_name: llama3.3_70B-pretrain.yaml
+      - model: Llama 3.1 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
+        config_name: llama3.1_70B-pretrain.yaml
+      - model: Llama 3.1 8B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
+        config_name: llama3.1_8B-pretrain.yaml
+      - model: Llama 2 7B
+        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
+        config_name: llama2_7B-pretrain.yaml
+      - model: Llama 2 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
+        config_name: llama2_70B-pretrain.yaml
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
+        config_name: deepseek_v3-pretrain.yaml
+      - model: DeepSeek-V2-Lite
+        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
+        config_name: deepseek_v2_lite-pretrain.yaml
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
+        config_name: mixtral_8x7B_v0.1-pretrain.yaml
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
+        config_name: mixtral_8x22B_v0.1-pretrain.yaml
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
+        config_name: primus_qwen2.5_7B-pretrain.yaml
+      - model: Qwen 2.5 72B
+        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
+        config_name: qwen2.5_72B-pretrain.yaml
diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
new file mode 100644
index 000000000..4a4c57a12
--- /dev/null
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
@@ -0,0 +1,39 @@
+dockers:
+  MI355X and MI350X:
+    pull_tag: rocm/primus:v25.9_gfx950
+    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
+    components: &docker_components
+      ROCm: 7.0.0
+      Primus: 0.3.0
+      Primus Turbo: 0.1.1
+      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
+      Python: "3.10"
+      Transformer Engine: 2.2.0.dev0+54dd2bdc
+      Flash Attention: 2.8.3
+      hipBLASLt: 911283acd1
+      Triton: 3.4.0+rocm7.0.0.git56765e8c
+      RCCL: 2.26.6
+  MI325X and MI300X:
+    pull_tag: rocm/primus:v25.9_gfx942
+    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
+    components: *docker_components
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+    - model: Llama 3.1 8B
+      mad_tag: primus_pyt_train_llama-3.1-8b
+      model_repo: meta-llama/Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      config_file:
+        bf16: "./llama3_8b_fsdp_bf16.toml"
+        fp8: "./llama3_8b_fsdp_fp8.toml"
+    - model: Llama 3.1 70B
+      mad_tag: primus_pyt_train_llama-3.1-70b
+      model_repo: meta-llama/Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B
+      precision: BF16
+      config_file:
+        bf16: "./llama3_70b_fsdp_bf16.toml"
+        fp8: "./llama3_70b_fsdp_fp8.toml"
diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
new file mode 100644
index 000000000..05c77d799
--- /dev/null
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
@@ -0,0 +1,186 @@
+dockers:
+  MI355X and MI350X:
+    pull_tag: rocm/pytorch-training:v25.9_gfx950
+    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
+    components: &docker_components
+      ROCm: 7.0.0
+      Primus: aab4234
+      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
+      Python: "3.10"
+      Transformer Engine: 2.2.0.dev0+54dd2bdc
+      Flash Attention: 2.8.3
+      hipBLASLt: 911283acd1
+      Triton: 3.4.0+rocm7.0.0.git56765e8c
+      RCCL: 2.26.6
+  MI325X and MI300X:
+    pull_tag: rocm/pytorch-training:v25.9_gfx942
+    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
+    components: *docker_components
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+    - model: Llama 4 Scout 17B-16E
+      mad_tag: pyt_train_llama-4-scout-17b-16e
+      model_repo: Llama-4-17B_16E
+      url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.3 70B
+      mad_tag: pyt_train_llama-3.3-70b
+      model_repo: Llama-3.3-70B
+      url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 3.2 1B
+      mad_tag: pyt_train_llama-3.2-1b
+      model_repo: Llama-3.2-1B
+      url: https://huggingface.co/meta-llama/Llama-3.2-1B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 3B
+      mad_tag: pyt_train_llama-3.2-3b
+      model_repo: Llama-3.2-3B
+      url: https://huggingface.co/meta-llama/Llama-3.2-3B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 Vision 11B
+      mad_tag: pyt_train_llama-3.2-vision-11b
+      model_repo: Llama-3.2-Vision-11B
+      url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.2 Vision 90B
+      mad_tag: pyt_train_llama-3.2-vision-90b
+      model_repo: Llama-3.2-Vision-90B
+      url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+      precision: BF16
+      training_modes: [pretrain, finetune_fw, finetune_lora]
+    - model: Llama 3.1 405B
+      mad_tag: pyt_train_llama-3.1-405b
+      model_repo: Llama-3.1-405B
+      url: https://huggingface.co/meta-llama/Llama-3.1-405B
+      precision: BF16
+      training_modes: [finetune_qlora]
+    - model: Llama 3 8B
+      mad_tag: pyt_train_llama-3-8b
+      model_repo: Llama-3-8B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3 70B
+      mad_tag: pyt_train_llama-3-70b
+      model_repo: Llama-3-70B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 7B
+      mad_tag: pyt_train_llama-2-7b
+      model_repo: Llama-2-7B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 2 13B
+      mad_tag: pyt_train_llama-2-13b
+      model_repo: Llama-2-13B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 70B
+      mad_tag: pyt_train_llama-2-70b
+      model_repo: Llama-2-70B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_lora, finetune_qlora]
+  - group: OpenAI
+    tag: openai
+    models:
+    - model: GPT OSS 20B
+      mad_tag: pyt_train_gpt_oss_20b
+      model_repo: GPT-OSS-20B
+      url: https://huggingface.co/openai/gpt-oss-20b
+      precision: BF16
+      training_modes: [HF_finetune_lora]
+    - model: GPT OSS 120B
+      mad_tag: pyt_train_gpt_oss_120b
+      model_repo: GPT-OSS-120B
+      url: https://huggingface.co/openai/gpt-oss-120b
+      precision: BF16
+      training_modes: [HF_finetune_lora]
+  - group: Qwen
+    tag: qwen
+    models:
+    - model: Qwen 3 8B
+      mad_tag: pyt_train_qwen3-8b
+      model_repo: Qwen3-8B
+      url: https://huggingface.co/Qwen/Qwen3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Qwen 3 32B
+      mad_tag: pyt_train_qwen3-32b
+      model_repo: Qwen3-32
+      url: https://huggingface.co/Qwen/Qwen3-32B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2.5 32B
+      mad_tag: pyt_train_qwen2.5-32b
+      model_repo: Qwen2.5-32B
+      url: https://huggingface.co/Qwen/Qwen2.5-32B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2.5 72B
+      mad_tag: pyt_train_qwen2.5-72b
+      model_repo: Qwen2.5-72B
+      url: https://huggingface.co/Qwen/Qwen2.5-72B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2 1.5B
+      mad_tag: pyt_train_qwen2-1.5b
+      model_repo: Qwen2-1.5B
+      url: https://huggingface.co/Qwen/Qwen2-1.5B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Qwen 2 7B
+      mad_tag: pyt_train_qwen2-7b
+      model_repo: Qwen2-7B
+      url: https://huggingface.co/Qwen/Qwen2-7B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+  - group: Stable Diffusion
+    tag: sd
+    models:
+    - model: Stable Diffusion XL
+      mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
+      model_repo: SDXL
+      url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
+      precision: BF16
+      training_modes: [posttrain-p]
+  - group: Flux
+    tag: flux
+    models:
+    - model: FLUX.1-dev
+      mad_tag: pyt_train_flux
+      model_repo: Flux
+      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+      precision: BF16
+      training_modes: [posttrain-p]
+  - group: NCF
+    tag: ncf
+    models:
+    - model: NCF
+      mad_tag: pyt_ncf_training
+      model_repo:
+      url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
+      precision: FP32
diff --git a/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
index 386538cf1..bd8dc5356 100644
--- a/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
@@ -1,22 +1,15 @@
-dockers:
-  MI355X and MI350X:
-    pull_tag: rocm/primus:v25.9_gfx950
-    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
-    components: &docker_components
-      ROCm: 7.0.0
-      Primus: 0.3.0
-      Primus Turbo: 0.1.1
-      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
-      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      Flash Attention: 2.8.3
-      hipBLASLt: 911283acd1
-      Triton: 3.4.0+rocm7.0.0.git56765e8c
-      RCCL: 2.26.6
-  MI325X and MI300X:
-    pull_tag: rocm/primus:v25.9_gfx942
-    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
-    components: *docker_components
+docker:
+  pull_tag: rocm/primus:v25.10
+  docker_hub_url: https://hub.docker.com/r/rocm/primus/tags?name=v25.10
+  components:
+    ROCm: 7.1.0
+    PyTorch: 2.10.0.dev20251112+rocm7.1
+    Python: "3.10"
+    Transformer Engine: 2.4.0.dev0+32e2d1d4
+    Flash Attention: 2.8.3
+    hipBLASLt: 1.2.0-09ab7153e2
+    Triton: 3.4.0
+    RCCL: 2.27.7
 model_groups:
   - group: Meta Llama
     tag: llama
diff --git a/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
index 4a4c57a12..3db8a411b 100644
--- a/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
@@ -1,39 +1,32 @@
-dockers:
-  MI355X and MI350X:
-    pull_tag: rocm/primus:v25.9_gfx950
-    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
-    components: &docker_components
-      ROCm: 7.0.0
-      Primus: 0.3.0
-      Primus Turbo: 0.1.1
-      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
-      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      Flash Attention: 2.8.3
-      hipBLASLt: 911283acd1
-      Triton: 3.4.0+rocm7.0.0.git56765e8c
-      RCCL: 2.26.6
-  MI325X and MI300X:
-    pull_tag: rocm/primus:v25.9_gfx942
-    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
-    components: *docker_components
+docker:
+  pull_tag: rocm/primus:v25.10
+  docker_hub_url: https://hub.docker.com/r/rocm/primus/tags?name=v25.10
+  components:
+    ROCm: 7.1.0
+    PyTorch: 2.10.0.dev20251112+rocm7.1
+    Python: "3.10"
+    Transformer Engine: 2.4.0.dev0+32e2d1d4
+    Flash Attention: 2.8.3
+    hipBLASLt: 1.2.0-09ab7153e2
 model_groups:
   - group: Meta Llama
     tag: llama
     models:
-    - model: Llama 3.1 8B
-      mad_tag: primus_pyt_train_llama-3.1-8b
-      model_repo: meta-llama/Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-      config_file:
-        bf16: "./llama3_8b_fsdp_bf16.toml"
-        fp8: "./llama3_8b_fsdp_fp8.toml"
-    - model: Llama 3.1 70B
-      mad_tag: primus_pyt_train_llama-3.1-70b
-      model_repo: meta-llama/Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B
-      precision: BF16
-      config_file:
-        bf16: "./llama3_70b_fsdp_bf16.toml"
-        fp8: "./llama3_70b_fsdp_fp8.toml"
+      - model: Llama 3.1 8B
+        mad_tag: primus_pyt_train_llama-3.1-8b
+        model_repo: Llama-3.1-8B
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B
+        precision: BF16
+      - model: Llama 3.1 70B
+        mad_tag: primus_pyt_train_llama-3.1-70b
+        model_repo: Llama-3.1-70B
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B
+        precision: BF16
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek V2 16B
+        mad_tag: primus_pyt_train_deepseek-v2
+        model_repo: DeepSeek-V2
+        url: https://huggingface.co/deepseek-ai/DeepSeek-V2
+        precision: BF16
diff --git a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
index 05c77d799..b037f5087 100644
--- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
@@ -1,21 +1,15 @@
-dockers:
-  MI355X and MI350X:
-    pull_tag: rocm/pytorch-training:v25.9_gfx950
-    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
-    components: &docker_components
-      ROCm: 7.0.0
-      Primus: aab4234
-      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
-      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      Flash Attention: 2.8.3
-      hipBLASLt: 911283acd1
-      Triton: 3.4.0+rocm7.0.0.git56765e8c
-      RCCL: 2.26.6
-  MI325X and MI300X:
-    pull_tag: rocm/pytorch-training:v25.9_gfx942
-    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
-    components: *docker_components
+docker:
+  pull_tag: rocm/primus:v25.10
+  docker_hub_url: https://hub.docker.com/r/rocm/primus/tags?name=v25.10
+  components:
+    ROCm: 7.1.0
+    Primus: 0.3.0
+    Primus Turbo: 0.1.1
+    PyTorch: 2.10.0.dev20251112+rocm7.1
+    Python: "3.10"
+    Transformer Engine: 2.4.0.dev0+32e2d1d4
+    Flash Attention: 2.8.3
+    hipBLASLt: 1.2.0-09ab7153e2
 model_groups:
   - group: Meta Llama
     tag: llama
@@ -119,6 +113,15 @@ model_groups:
       url: https://huggingface.co/openai/gpt-oss-120b
       precision: BF16
       training_modes: [HF_finetune_lora]
+  - group: DeepSeek
+    tag: deepseek
+    models:
+    - model: DeepSeek V2 16B
+      mad_tag: primus_pyt_train_deepseek-v2
+      model_repo: DeepSeek-V2
+      url: https://huggingface.co/deepseek-ai/DeepSeek-V2
+      precision: BF16
+      training_modes: [pretrain]
   - group: Qwen
     tag: qwen
     models:
@@ -166,7 +169,7 @@ model_groups:
       model_repo: SDXL
       url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
       precision: BF16
-      training_modes: [posttrain-p]
+      training_modes: [posttrain]
   - group: Flux
     tag: flux
     models:
@@ -175,12 +178,20 @@ model_groups:
       model_repo: Flux
       url: https://huggingface.co/black-forest-labs/FLUX.1-dev
       precision: BF16
-      training_modes: [posttrain-p]
+      training_modes: [posttrain]
   - group: NCF
     tag: ncf
     models:
     - model: NCF
       mad_tag: pyt_ncf_training
       model_repo:
-      url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
+      url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
       precision: FP32
+  - group: DLRM
+    tag: dlrm
+    models:
+    - model: DLRM v2
+      mad_tag: pyt_train_dlrm
+      model_repo: DLRM
+      url: https://github.com/AMD-AGI/DLRMBenchmark
+      training_modes: [pretrain]
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
index 6c8cf154f..bfd9ad3cc 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
@@ -36,12 +36,10 @@ accelerate training workloads:
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
 
-   {% set dockers = data.dockers %}
    .. tab-set::
 
-   {% for supported_gpus, docker in dockers.items() %}
-      .. tab-item:: {{ supported_gpus }}
-         :sync: {{ supported_gpus }}
+      .. tab-item:: {{ data.docker.pull_tag }}
+         :sync: {{ data.docker.pull_tag }}
 
          .. list-table::
             :header-rows: 1
@@ -49,12 +47,12 @@ accelerate training workloads:
             * - Software component
               - Version
 
-            {% for component_name, component_version in docker.components.items() %}
+            {% for component_name, component_version in data.docker.components.items() %}
             * - {{ component_name }}
               - {{ component_version }}
             {% endfor %}
-   {% endfor %}
-   .. _amd-megatron-lm-model-support:
+
+   .. _amd-megatron-lm-model-support-v2510:
 
    Supported models
    ================
@@ -99,7 +97,7 @@ accelerate training workloads:
    Some models, such as Llama, require an external license agreement through
    a third party (for example, Meta).
 
-.. _amd-megatron-lm-performance-measurements:
+.. _amd-megatron-lm-performance-measurements-v2510:
 
 Performance measurements
 ========================
@@ -131,7 +129,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
 <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
 system's configuration.
 
-.. _mi300x-amd-megatron-lm-training:
+.. _mi300x-amd-megatron-lm-training-v2510:
 
 Environment setup
 =================
@@ -140,52 +138,38 @@ Use the following instructions to set up the environment, configure the script t
 reproduce the benchmark results on MI300X Series GPUs with the AMD Megatron-LM Docker
 image.
 
-.. _amd-megatron-lm-requirements:
+.. _amd-megatron-lm-requirements-v2510:
 
 Download the Docker image
 -------------------------
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
 
-   {% set dockers = data.dockers %}
+   {% set docker = data.docker %}
    1. Use the following command to pull the Docker image from Docker Hub.
 
-      .. tab-set::
+      .. code-block:: shell
 
-         {% for supported_gpus, docker in dockers.items() %}
-         .. tab-item:: {{ supported_gpus }}
-            :sync: {{ supported_gpus }}
-
-            .. code-block:: shell
-
-               docker pull {{ docker.pull_tag }}
-         {% endfor %}
+         docker pull {{ docker.pull_tag }}
 
    2. Launch the Docker container.
 
-      .. tab-set::
+      .. code-block:: shell
 
-         {% for supported_gpus, docker in dockers.items() %}
-         .. tab-item:: {{ supported_gpus }}
-            :sync: {{ supported_gpus }}
-
-            .. code-block:: shell
-
-               docker run -it \
-                   --device /dev/dri \
-                   --device /dev/kfd \
-                   --device /dev/infiniband \
-                   --network host --ipc host \
-                   --group-add video \
-                   --cap-add SYS_PTRACE \
-                   --security-opt seccomp=unconfined \
-                   --privileged \
-                   -v $HOME:$HOME \
-                   -v $HOME/.ssh:/root/.ssh \
-                   --shm-size 128G \
-                   --name megatron_training_env \
-                   {{ docker.pull_tag }}
-         {% endfor %}
+         docker run -it \
+             --device /dev/dri \
+             --device /dev/kfd \
+             --device /dev/infiniband \
+             --network host --ipc host \
+             --group-add video \
+             --cap-add SYS_PTRACE \
+             --security-opt seccomp=unconfined \
+             --privileged \
+             -v $HOME:$HOME \
+             -v $HOME/.ssh:/root/.ssh \
+             --shm-size 128G \
+             --name megatron_training_env \
+             {{ docker.pull_tag }}
 
 3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
 
@@ -206,7 +190,7 @@ Download the Docker image
 The Docker container hosts a verified commit of
 `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__.
 
-.. _amd-megatron-lm-environment-setup:
+.. _amd-megatron-lm-environment-setup-v2510:
 
 Configuration
 =============
@@ -216,39 +200,39 @@ Configuration
    Update the ``train_llama3.sh`` configuration script in the ``examples/llama``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
 
 .. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
 
    Update the ``train_llama2.sh`` configuration script in the ``examples/llama``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
 
 .. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
 
    Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
 
 .. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
 
    Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
 
 .. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
 
    Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral``
    directory of
    `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/mixtral>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
 
 .. note::
 
-   See :ref:`Key options <amd-megatron-lm-benchmark-test-vars>` for more information on configuration options.
+   See :ref:`Key options <amd-megatron-lm-benchmark-test-vars-v2510>` for more information on configuration options.
 
 Multi-node configuration
 ------------------------
@@ -256,7 +240,7 @@ Multi-node configuration
 Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
-training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands.
+training. See :ref:`amd-megatron-lm-multi-node-examples-v2510` for example run commands.
 
-.. _amd-megatron-lm-tokenizer:
+.. _amd-megatron-lm-tokenizer-v2510:
 
 Tokenizer
 ---------
@@ -393,7 +377,7 @@ Download the dataset
 
    ``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer.
    Remember to either pre-download the tokenizer or setup Hugging Face access
-   otherwise when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer>` section.
+   otherwise when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer-v2510>` section.
 
    .. note::
 
@@ -495,15 +479,38 @@ Download the dataset
 
    Ensure that the files are accessible inside the Docker container.
 
-.. _amd-megatron-lm-run-training:
+.. _amd-megatron-lm-run-training-v2510:
 
 Run training
 ============
 
 Use the following example commands to set up the environment, configure
-:ref:`key options <amd-megatron-lm-benchmark-test-vars>`, and run training on
+:ref:`key options <amd-megatron-lm-benchmark-test-vars-v2510>`, and run training on
 MI300X Series GPUs with the AMD Megatron-LM environment.
 
+Before starting training, export the following environment variables.
+
+.. tab-set::
+
+   .. tab-item:: MI355X and MI350X
+
+      .. code-block:: shell
+
+         export HSA_NO_SCRATCH_RECLAIM=1
+         export NVTE_CK_USES_BWD_V3=1
+         export NVTE_CK_USES_BWD_V3=1
+
+   .. tab-item:: MI325X and MI300X
+
+      .. code-block:: shell
+
+         export HSA_NO_SCRATCH_RECLAIM=1
+         export NVTE_CK_USES_BWD_V3=1
+         export NVTE_CK_USES_BWD_V3=1
+
+         # Set this on MI325X/MI300X only
+         export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
 Single node training
 --------------------
 
@@ -913,7 +920,7 @@ Single node training
           RECOMPUTE_ACTIVATIONS=full \
           CKPT_FORMAT=torch_dist
 
-.. _amd-megatron-lm-multi-node-examples:
+.. _amd-megatron-lm-multi-node-examples-v2510:
 
 Multi-node training examples
 ----------------------------
@@ -964,7 +971,7 @@ training on 16 nodes, try the following command:
 
    sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh
 
-.. _amd-megatron-lm-benchmark-test-vars:
+.. _amd-megatron-lm-benchmark-test-vars-v2510:
 
 Key options
 -----------
@@ -1029,11 +1036,6 @@ The benchmark tests support the following sets of variables.
 ``RECOMPUTE_NUM_LAYERS``
   Number of layers used for checkpointing recompute.
 
-Known issues
-============
-
-PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
-
 Previous versions
 =================
 
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
index 1d3c4905b..1b70f9386 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
@@ -16,14 +16,23 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
      - Components
      - Resources
 
-   * - v25.9 (latest)
+   * - v25.10 (latest)
+     -
+       * ROCm 7.1.0
+       * PyTorch 2.10.0.dev20251112+rocm7.1
+     -
+       * :doc:`Primus Megatron documentation <../primus-megatron>`
+       * :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
+
+   * - v25.9
      -
        * ROCm 7.0.0
        * Primus 0.3.0
        * PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
      -
-       * :doc:`Primus Megatron documentation <../primus-megatron>`
-       * :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
+       * :doc:`Primus Megatron documentation <primus-megatron-v25.9>`
+       * :doc:`Megatron-LM (legacy) documentation <megatron-lm-v25.9>`
        * `Docker Hub (gfx950) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
        * `Docker Hub (gfx942) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357>`__
 
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.9.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.9.rst
new file mode 100644
index 000000000..7668c33b1
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.9.rst
@@ -0,0 +1,1044 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using Megatron-LM for ROCm.
+   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
+
+******************************************
+Training a model with Megatron-LM on ROCm
+******************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm Megatron-LM
+   training performance documentation. See :doc:`../megatron-lm` for the latest version.
+
+   For a unified training solution on AMD GPUs with ROCm, the `rocm/megatron-lm
+   <https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
+   deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
+   The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
+   including Megatron-LM and :doc:`torchtitan <../primus-pytorch>`.
+
+   Primus with Megatron is designed to replace this ROCm Megatron-LM training workflow.
+   To learn how to migrate workloads from Megatron-LM to Primus with Megatron,
+   see :doc:`megatron-lm-primus-migration-guide`.
+
+The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
+a specialized fork of the robust Megatron-LM, designed to enable efficient
+training of large-scale language models on AMD GPUs. By leveraging AMD
+Instinct™ GPUs, Megatron-LM delivers enhanced scalability, performance, and
+resource utilization for AI workloads. It is
+purpose-built to support models like Llama, DeepSeek, and Mixtral,
+enabling developers to train next-generation AI models more
+efficiently.
+
+AMD provides ready-to-use Docker images for MI355X, MI350X, MI325X, and MI300X
+GPUs containing essential components, including PyTorch, ROCm libraries, and
+Megatron-LM utilities. The images contain the following software components to
+accelerate training workloads:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.9-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   .. tab-set::
+
+   {% for supported_gpus, docker in dockers.items() %}
+      .. tab-item:: {{ supported_gpus }}
+         :sync: {{ supported_gpus }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+            {% endfor %}
+   {% endfor %}
+   .. _amd-megatron-lm-model-support:
+
+   Supported models
+   ================
+
+   The following models are supported for training performance benchmarking with Megatron-LM and ROCm
+   on AMD Instinct MI300X Series GPUs.
+   Some instructions, commands, and training recommendations in this documentation might
+   vary by model -- select one to get started.
+
+   {% set model_groups = data.model_groups %}
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
+      </div>
+
+.. note::
+
+   Some models, such as Llama, require an external license agreement through
+   a third party (for example, Meta).
+
+.. _amd-megatron-lm-performance-measurements:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`__
+page provides reference throughput and latency measurements for training
+popular AI models.
+
+.. important::
+
+   The performance data presented in
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`__
+   only reflects the latest version of this training benchmarking environment.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+.. _mi300x-amd-megatron-lm-training:
+
+Environment setup
+=================
+
+Use the following instructions to set up the environment, configure the script to train models, and
+reproduce the benchmark results on MI300X Series GPUs with the AMD Megatron-LM Docker
+image.
+
+.. _amd-megatron-lm-requirements:
+
+Download the Docker image
+-------------------------
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.9-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   1. Use the following command to pull the Docker image from Docker Hub.
+
+      .. tab-set::
+
+         {% for supported_gpus, docker in dockers.items() %}
+         .. tab-item:: {{ supported_gpus }}
+            :sync: {{ supported_gpus }}
+
+            .. code-block:: shell
+
+               docker pull {{ docker.pull_tag }}
+         {% endfor %}
+
+   2. Launch the Docker container.
+
+      .. tab-set::
+
+         {% for supported_gpus, docker in dockers.items() %}
+         .. tab-item:: {{ supported_gpus }}
+            :sync: {{ supported_gpus }}
+
+            .. code-block:: shell
+
+               docker run -it \
+                   --device /dev/dri \
+                   --device /dev/kfd \
+                   --device /dev/infiniband \
+                   --network host --ipc host \
+                   --group-add video \
+                   --cap-add SYS_PTRACE \
+                   --security-opt seccomp=unconfined \
+                   --privileged \
+                   -v $HOME:$HOME \
+                   -v $HOME/.ssh:/root/.ssh \
+                   --shm-size 128G \
+                   --name megatron_training_env \
+                   {{ docker.pull_tag }}
+         {% endfor %}
+
+3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
+
+   .. code-block:: shell
+
+      docker start megatron_training_env
+      docker exec -it megatron_training_env bash
+
+4. **Megatron-LM backward compatibility setup** -- this Docker image is primarily intended for use with Primus, but it maintains Megatron-LM compatibility with limited support.
+   To roll back to using Megatron-LM, follow these steps:
+
+   .. code-block:: shell
+
+      cd /workspace/Megatron-LM/
+      pip uninstall megatron-core
+      pip install -e .
+
+The Docker container hosts a verified commit of
+`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__.
+
+.. _amd-megatron-lm-environment-setup:
+
+Configuration
+=============
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b
+
+   Update the ``train_llama3.sh`` configuration script in the ``examples/llama``
+   directory of
+   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
+
+.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
+
+   Update the ``train_llama2.sh`` configuration script in the ``examples/llama``
+   directory of
+   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
+
+   Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3``
+   directory of
+   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to configure your training run.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
+
+   Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
+   directory of
+   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__ to configure your training run.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
+
+.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
+
+   Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral``
+   directory of
+   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/mixtral>`__ to configure your training run.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
+
+.. note::
+
+   See :ref:`Key options <amd-megatron-lm-benchmark-test-vars>` for more information on configuration options.
+
+Multi-node configuration
+------------------------
+
+Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
+training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands.
+
+.. _amd-megatron-lm-tokenizer:
+
+Tokenizer
+---------
+
+You can assign the path of an existing tokenizer to the ``TOKENIZER_MODEL`` as shown in the following examples.
+If the tokenizer is not found, it'll be downloaded if publicly available.
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b
+
+   If you do not have the Llama 3.3 tokenizer locally, you need to use your
+   personal Hugging Face access token ``HF_TOKEN`` to download the tokenizer.
+   See `Llama-3.3-70B-Instruct
+   <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_. After you are
+   authorized, use your ``HF_TOKEN`` to download the tokenizer and set the
+   variable ``TOKENIZER_MODEL`` to the tokenizer path.
+
+   .. code-block:: shell
+
+      export HF_TOKEN=<Your personal Hugging Face access token>
+
+   The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL="meta-llama/Llama-3.3-70B-Instruct"
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b
+
+   The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL="meta-llama/Llama-3.1-8B"
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b
+
+   The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL="meta-llama/Llama-3.1-70B"
+
+.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
+
+   The training script uses either the ``Llama2Tokenizer`` or ``HuggingFaceTokenizer`` by default.
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
+
+   The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL="deepseek-ai/DeepSeek-V3"
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
+
+   The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL="deepseek-ai/DeepSeek-V2-Lite"
+
+.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
+
+   Download the Mixtral tokenizer.
+
+   .. code-block:: shell
+
+      mkdir tokenizer
+      cd tokenizer
+      export HF_TOKEN=<Your personal Hugging Face access token>
+      wget --header="Authorization: Bearer $HF_TOKEN" -O ./tokenizer.model https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/resolve/main/tokenizer.model
+
+   Use the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL=tokenizer/tokenizer.model
+
+.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b
+
+   The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL="Qwen/Qwen2.5-7B"
+
+.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b
+
+   The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL="Qwen/Qwen2.5-72B"
+
+Dataset options
+---------------
+
+You can use either mock data or real data for training.
+
+* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
+  value is ``1`` for enabled.
+
+  .. code-block:: bash
+
+     MOCK_DATA=1
+
+* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
+
+  .. code-block:: bash
+
+     MOCK_DATA=0
+
+     DATA_PATH="/data/bookcorpus_text_sentence"  # Change to where your dataset is stored
+
+  Ensure that the files are accessible inside the Docker container.
+
+Download the dataset
+^^^^^^^^^^^^^^^^^^^^
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b pyt_megatron_lm_train_llama-3.1-70b-proxy
+
+   For Llama models, use the `prepare_dataset.sh
+   <https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`_ script
+   to prepare your dataset.
+   To download the dataset, set the ``DATASET`` variable to the dataset you'd
+   like to use. Three datasets are supported: ``DATASET=wiki``, ``DATASET=fineweb``, and
+   ``DATASET=bookcorpus``.
+
+   .. code-block:: shell
+
+      DATASET=wiki TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for wiki-en dataset
+      DATASET=bookcorpus TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for bookcorpus dataset
+
+   ``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer.
+   Remember to either pre-download the tokenizer or set up Hugging Face access
+   so that it can be downloaded when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer>` section.
+
+   .. note::
+
+      When training, set ``DATA_PATH`` to the specific file name prefix of the ``.bin`` and ``.idx``
+      files, as in the following example:
+
+      .. code-block:: shell
+
+         DATA_PATH="data/bookcorpus_text_sentence" # Change to where your dataset is stored.
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
+
+   If you don't already have the dataset, download the DeepSeek dataset using the following
+   commands:
+
+   .. code-block:: shell
+
+      mkdir deepseek-datasets
+      cd deepseek-datasets
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
+      cd ..
+      bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3
+
+   To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
+
+   .. code-block:: bash
+
+      MOCK_DATA=0 # Train on real data
+
+      DATA_DIR="<path-to>/deepseek-datasets"  # Change to where your dataset is stored
+
+   Ensure that the files are accessible inside the Docker container.
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
+
+   If you don't already have the dataset, download the DeepSeek dataset using the following
+   commands:
+
+   .. code-block:: shell
+
+      mkdir deepseek-datasets
+      cd deepseek-datasets
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
+      cd ..
+      bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3
+
+   To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
+
+   .. code-block:: bash
+
+      MOCK_DATA=0 # Train on real data
+
+      DATA_DIR="<path-to>/deepseek-datasets"  # Change to where your dataset is stored
+
+.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
+
+   If you don't already have the dataset, download the Mixtral dataset using the following
+   commands:
+
+   .. code-block:: shell
+
+      mkdir mixtral-datasets
+      cd mixtral-datasets
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.bin
+      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.idx
+
+   To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
+
+   .. code-block:: bash
+
+      MOCK_DATA=0 # Train on real data
+
+      DATA_DIR="<path-to>/mixtral-datasets"  # Change to where your dataset is stored
+
+   Ensure that the files are accessible inside the Docker container.
+
+.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b pyt_megatron_lm_train_qwen2.5-72b
+
+   If you don't already have the dataset, download the Qwen dataset using the following
+   commands:
+
+   .. code-block:: shell
+
+      mkdir -p temp/qwen-datasets
+      wget -P temp/qwen-datasets https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.bin
+      wget -P temp/qwen-datasets https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.idx
+
+   To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
+
+   .. code-block:: bash
+
+      MOCK_DATA=0 # Train on real data
+
+      DATA_DIR="<path-to>/qwen-datasets"  # Change to where your dataset is stored
+
+   Ensure that the files are accessible inside the Docker container.
+
+.. _amd-megatron-lm-run-training:
+
+Run training
+============
+
+Use the following example commands to set up the environment, configure
+:ref:`key options <amd-megatron-lm-benchmark-test-vars>`, and run training on
+MI300X Series GPUs with the AMD Megatron-LM environment.
+
+Single node training
+--------------------
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b
+
+   To run the training on a single node for Llama 3.3 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument.
+   For example, use the following command:
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL=meta-llama/Llama-3.3-70B-Instruct \
+      CKPT_FORMAT=torch_dist \
+      TEE_OUTPUT=1 \
+      RECOMPUTE=1 \
+      SEQ_LENGTH=8192 \
+      MBS=2 \
+      BS=16 \
+      TE_FP8=0 \
+      TP=1 \
+      PP=1 \
+      FSDP=1 \
+      MODEL_SIZE=70 \
+      TOTAL_ITERS=50 \
+      bash examples/llama/train_llama3.sh
+
+   .. note::
+
+      It is suggested to use ``TP=1`` when FSDP is enabled for higher
+      throughput. FSDP-v2 is not supported with pipeline parallelism, expert
+      parallelism, MCore's distributed optimizer, gradient accumulation fusion,
+      or FP16.
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b
+
+   To run training on a single node for Llama 3.1 8B FP8, navigate to the Megatron-LM folder and use the
+   following command.
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            TEE_OUTPUT=1 \
+            MBS=4 \
+            BS=512 \
+            TP=1 \
+            TE_FP8=1 \
+            SEQ_LENGTH=8192 \
+            MODEL_SIZE=8 \
+            TOTAL_ITERS=10 \
+            GEMM_TUNING=0 \
+            bash examples/llama/train_llama3.sh
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            TEE_OUTPUT=1 \
+            MBS=2 \
+            BS=128 \
+            TP=1 \
+            TE_FP8=1 \
+            SEQ_LENGTH=8192 \
+            MODEL_SIZE=8 \
+            TOTAL_ITERS=50 \
+            bash examples/llama/train_llama3.sh
+
+   For Llama 3.1 8B BF16, use the following command:
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            TEE_OUTPUT=1 \
+            MBS=4 \
+            BS=512 \
+            TP=1 \
+            TE_FP8=0 \
+            SEQ_LENGTH=8192 \
+            MODEL_SIZE=8 \
+            TOTAL_ITERS=10 \
+            GEMM_TUNING=1 \
+            bash examples/llama/train_llama3.sh
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            TEE_OUTPUT=1 \
+            MBS=2 \
+            BS=128 \
+            TP=1 \
+            TE_FP8=0 \
+            SEQ_LENGTH=8192 \
+            MODEL_SIZE=8 \
+            TOTAL_ITERS=50 \
+            bash examples/llama/train_llama3.sh
+
+.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b
+
+   To run the training on a single node for Llama 3.1 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument.
+   For example, use the following command:
+
+   .. code-block:: shell
+
+      CKPT_FORMAT=torch_dist \
+      TEE_OUTPUT=1 \
+      MBS=3 \
+      BS=24 \
+      TP=1 \
+      TE_FP8=0 \
+      FSDP=1 \
+      RECOMPUTE=1 \
+      SEQ_LENGTH=8192 \
+      MODEL_SIZE=70 \
+      TOTAL_ITERS=50 \
+      bash examples/llama/train_llama3.sh
+
+   .. note::
+
+      It is suggested to use ``TP=1`` when FSDP is enabled for higher
+      throughput. FSDP-v2 is not supported with pipeline parallelism, expert
+      parallelism, MCore's distributed optimizer, gradient accumulation fusion,
+      or FP16.
+
+   To run the training on a single node for Llama 3.1 70B FP8, use the
+   following command.
+
+   .. note::
+
+      The MI300X configuration uses a proxy model. On MI300X GPUs, use two or more nodes
+      to run the full Llama 3.1 70B model with FP8 precision. MI355X and MI350X GPUs
+      can support the full 70B model with FP8 precision on a single node.
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            CKPT_FORMAT=torch_dist \
+            TEE_OUTPUT=1 \
+            RECOMPUTE=1 \
+            MBS=3 \
+            BS=24 \
+            TP=1 \
+            TE_FP8=1 \
+            SEQ_LENGTH=8192 \
+            MODEL_SIZE=70 \
+            FSDP=1 \
+            TOTAL_ITERS=10 \
+            bash examples/llama/train_llama3.sh
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            FP8_WEIGHT_TRANSPOSE_CACHE=0 \
+            CKPT_FORMAT=torch_dist \
+            TEE_OUTPUT=1 \
+            RECOMPUTE=1 \
+            MBS=3 \
+            BS=24 \
+            TP=1 \
+            TE_FP8=1 \
+            SEQ_LENGTH=8192 \
+            MODEL_SIZE=70 \
+            FSDP=1 \
+            TOTAL_ITERS=10 \
+            NUM_LAYERS=40 \
+            bash examples/llama/train_llama3.sh
+
+   .. note::
+
+      The MI300X configuration uses a proxy model. On MI300X GPUs, use two or more nodes
+      to run the full Llama 3.1 70B model with FP8 precision. MI355X and MI350X GPUs
+      can support the full 70B model with FP8 precision on a single node.
+
+   .. note::
+
+      It is suggested to use ``TP=1`` when FSDP is enabled for higher
+      throughput. FSDP-v2 is not supported with pipeline parallelism, expert
+      parallelism, MCore's distributed optimizer, gradient accumulation fusion,
+      or FP16.
+
+.. container:: model-doc pyt_megatron_lm_train_llama-2-7b
+
+   To run training on a single node for Llama 2 7B FP8, navigate to the Megatron-LM folder and use the
+   following command.
+
+   .. code-block:: shell
+
+      TEE_OUTPUT=1 \
+      MBS=4 \
+      BS=256 \
+      TP=1 \
+      TE_FP8=1 \
+      SEQ_LENGTH=4096 \
+      MODEL_SIZE=7 \
+      TOTAL_ITERS=50 \
+      bash examples/llama/train_llama2.sh
+
+   For Llama 2 7B BF16, use the following command:
+
+   .. code-block:: shell
+
+      TEE_OUTPUT=1 \
+      MBS=4 \
+      BS=256 \
+      TP=1 \
+      TE_FP8=0 \
+      SEQ_LENGTH=4096 \
+      MODEL_SIZE=7 \
+      TOTAL_ITERS=50 \
+      bash examples/llama/train_llama2.sh
+
+.. container:: model-doc pyt_megatron_lm_train_llama-2-70b
+
+   To run the training on a single node for Llama 2 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument.
+   For example, use the following command:
+
+   .. code-block:: shell
+
+      CKPT_FORMAT=torch_dist \
+      TEE_OUTPUT=1 \
+      MBS=7 \
+      BS=56 \
+      TP=1 \
+      TE_FP8=0 \
+      FSDP=1 \
+      RECOMPUTE=1 \
+      SEQ_LENGTH=4096 \
+      MODEL_SIZE=70 \
+      TOTAL_ITERS=50 \
+      bash examples/llama/train_llama2.sh
+
+   .. note::
+
+      It is suggested to use ``TP=1`` when FSDP is enabled for higher
+      throughput. FSDP-v2 is not supported with pipeline parallelism, expert
+      parallelism, MCore's distributed optimizer, gradient accumulation fusion,
+      or FP16.
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
+
+   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
+   navigate to the Megatron-LM folder and use the following command.
+
+   .. code-block:: shell
+
+      export NVTE_FUSED_ATTN_CK=0
+      FORCE_BALANCE=true \
+      RUN_ENV=cluster \
+      MODEL_SIZE=671B \
+      TRAIN_ITERS=50 \
+      SEQ_LEN=4096 \
+      NUM_LAYERS=3 \
+      MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=32 \
+      PR=bf16 \
+      TP=1 PP=1 ETP=1 EP=8 \
+      GEMM_TUNING=1 \
+      NVTE_CK_USES_BWD_V3=1 \
+      USE_GROUPED_GEMM=true MOE_USE_LEGACY_GROUPED_GEMM=true \
+      GPT_LAYER_IN_TE=true \
+      bash examples/deepseek_v3/train_deepseekv3.sh
+
+.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
+
+   To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
+   navigate to the Megatron-LM folder and use the following command.
+
+   .. code-block:: shell
+
+      export NVTE_FUSED_ATTN_CK=0
+      GEMM_TUNING=1 \
+      PR=bf16 \
+      MBS=4 \
+      AC=none \
+      SEQ_LEN=4096 \
+      PAD_LEN=4096 \
+      TRAIN_ITERS=20 \
+      bash examples/deepseek_v2/train_deepseekv2.sh
+
+   .. note::
+
+      Note that DeepSeek-V2-Lite is experiencing instability due to GPU memory access faults
+      when training for a large number of iterations.
+      For stability, it's recommended to use Primus for this workload.
+      See :doc:`../primus-megatron`.
+
+.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b
+
+   To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
+   navigate to the Megatron-LM folder and use the following command.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL=<path/to/tokenizer/model> \
+      RECOMPUTE_NUM_LAYERS=0 \
+      TEE_OUTPUT=1 \
+      MBS=1 \
+      GBS=16 \
+      TP_SIZE=1 \
+      PP_SIZE=1 \
+      AC=none \
+      PR=bf16 \
+      EP_SIZE=8 \
+      ETP_SIZE=1 \
+      SEQLEN=4096 \
+      FORCE_BALANCE=true \
+      MOCK_DATA=1 \
+      RUN_ENV=cluster \
+      MODEL_SIZE=8x7B \
+      TRAIN_ITERS=50 \
+      bash examples/mixtral/train_mixtral_moe.sh
+
+.. container:: model-doc pyt_megatron_lm_train_mixtral-8x22b-proxy
+
+   To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with 4-layer proxy,
+   navigate to the Megatron-LM folder and use the following command.
+
+   .. code-block:: shell
+
+      TOKENIZER_MODEL=<path/to/tokenizer/model> \
+      RECOMPUTE_NUM_LAYERS=4 \
+      TEE_OUTPUT=1 \
+      MBS=1 \
+      GBS=16 \
+      TP_SIZE=1 \
+      PP_SIZE=1 \
+      AC=full \
+      NUM_LAYERS=4 \
+      PR=bf16 \
+      EP_SIZE=8 \
+      ETP_SIZE=1 \
+      SEQLEN=8192 \
+      FORCE_BALANCE=true \
+      MOCK_DATA=1 \
+      RUN_ENV=cluster \
+      MODEL_SIZE=8x22B \
+      TRAIN_ITERS=50 \
+      bash examples/mixtral/train_mixtral_moe.sh
+
+.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b
+
+   To run training on a single node for Qwen 2.5 7B BF16, use the following
+   command.
+
+   .. code-block:: shell
+
+      bash examples/qwen/train_qwen2.sh \
+          TP=1 \
+          CP=1 \
+          PP=1 \
+          MBS=10 \
+          BS=640 \
+          TE_FP8=0 \
+          MODEL_SIZE=7 \
+          SEQ_LENGTH=2048 \
+          TOTAL_ITERS=50 \
+          MOCK_DATA=1 \
+          TOKENIZER_MODEL=Qwen/Qwen2.5-7B
+
+   For FP8, use the following command.
+
+   .. code-block:: shell
+
+      bash examples/qwen/train_qwen2.sh \
+          TP=1 \
+          CP=1 \
+          PP=1 \
+          MBS=10 \
+          BS=640 \
+          TE_FP8=1 \
+          MODEL_SIZE=7 \
+          SEQ_LENGTH=2048 \
+          TOTAL_ITERS=50 \
+          MOCK_DATA=1 \
+          TOKENIZER_MODEL=Qwen/Qwen2.5-7B
+
+.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b
+
+   To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
+
+   .. code-block:: shell
+
+      bash examples/qwen/train_qwen2.sh \
+          FSDP=1 \
+          CP=1 \
+          PP=1 \
+          MBS=3 \
+          BS=24 \
+          TE_FP8=0 \
+          MODEL_SIZE=72 \
+          SEQ_LENGTH=2048 \
+          TOTAL_ITERS=50 \
+          MOCK_DATA=1 \
+          TOKENIZER_MODEL=Qwen/Qwen2.5-72B \
+          RECOMPUTE_ACTIVATIONS=full \
+          CKPT_FORMAT=torch_dist
+
+.. _amd-megatron-lm-multi-node-examples:
+
+Multi-node training examples
+----------------------------
+
+To run training on multiple nodes, launch the Docker container on each node.
+For example, for Llama 3 using a two-node setup (``NODE0`` as the master node),
+use these commands.
+
+* On the master node ``NODE0``:
+
+  .. code-block:: shell
+
+     TEE_OUTPUT=1 \
+     MBS=2 \
+     BS=256 \
+     TP=1 \
+     TE_FP8=1 \
+     SEQ_LENGTH=8192 \
+     MODEL_SIZE=8  \
+     MASTER_ADDR=IP_NODE0 \
+     NNODES=2 \
+     NODE_RANK=0 \
+     bash examples/llama/train_llama3.sh
+
+* On the worker node ``NODE1``:
+
+  .. code-block:: shell
+
+     TEE_OUTPUT=1 \
+     MBS=2 \
+     BS=256 \
+     TP=1 \
+     TE_FP8=1 \
+     SEQ_LENGTH=8192 \
+     MODEL_SIZE=8  \
+     MASTER_ADDR=IP_NODE0 \
+     NNODES=2 \
+     NODE_RANK=1 \
+     bash examples/llama/train_llama3.sh
+
+Or, for DeepSeek-V3, an example script ``train_deepseek_v3_slurm.sh`` is
+provided in
+`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to
+enable training at scale under a SLURM environment. For example, to run
+training on 16 nodes, try the following command:
+
+.. code-block:: shell
+
+   sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh
+
+.. _amd-megatron-lm-benchmark-test-vars:
+
+Key options
+-----------
+
+The benchmark tests support the following sets of variables.
+
+``TEE_OUTPUT``
+  ``1`` to enable training logs or ``0`` to disable.
+
+``TE_FP8``
+  ``0`` for BF16 or ``1`` for FP8 -- ``0`` by default.
+
+``GEMM_TUNING``
+  ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
+
+``USE_FLASH_ATTN``
+  ``1`` to enable Flash Attention.
+
+``FSDP``
+  ``1`` to enable PyTorch FSDP2. If FSDP is enabled, ``--use-distributed-optimizer``,
+  ``--overlap-param-gather``, and ``--sequence-parallel`` are automatically disabled.
+
+``ENABLE_PROFILING``
+  ``1`` to enable PyTorch profiling for performance analysis.
+
+``transformer-impl``
+  ``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.
+
+``MODEL_SIZE``
+  ``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2, for example.
+
+``TOTAL_ITERS``
+  The total number of iterations -- ``10`` by default.
+
+``MOCK_DATA``
+  ``1`` to use mock data or ``0`` to use real data you provide.
+
+``MBS``
+  Micro batch size.
+
+``BS``
+  Global batch size.
+
+``TP`` / ``TP_SIZE``
+  Tensor parallel (``1``, ``2``, ``4``, ``8``). ``TP`` is disabled when ``FSDP`` is turned on.
+
+``EP`` / ``EP_SIZE``
+  Expert parallel for MoE models.
+
+``SEQ_LENGTH``
+  Input sequence length.
+
+``PR``
+  Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.
+
+``AC``
+  Activation checkpointing (``none``, ``sel``, or ``full``) -- ``sel`` by default.
+
+``NUM_LAYERS``
+  Use a reduced number of layers as a proxy model.
+
+``RECOMPUTE_NUM_LAYERS``
+  Number of layers used for checkpointing recompute.
+
+Known issues
+============
+
+PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
+
+Previous versions
+=================
+
+See :doc:`megatron-lm-history` to find documentation for previous releases
+of the ``ROCm/megatron-lm`` Docker image.
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.9.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.9.rst
new file mode 100644
index 000000000..bc544ca4a
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.9.rst
@@ -0,0 +1,1019 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using Megatron-LM for ROCm.
+   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
+
+********************************************
+Training a model with Primus and Megatron-LM
+********************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm Megatron-LM
+   training performance documentation. See :doc:`../primus-megatron` for the latest version.
+
+`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
+training framework for AMD Instinct GPUs designed to support multiple training
+engine backends -- including Megatron -- to deliver scalable, high-performance
+model training. Performance acceleration is powered by `Primus Turbo
+<https://github.com/AMD-AGI/Primus-Turbo>`__ and ROCm libraries.
+
+.. note::
+
+   For a unified training solution on AMD GPUs with ROCm, the `rocm/megatron-lm
+   <https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
+   deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
+   The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
+   including Megatron-LM and :doc:`torchtitan <../primus-pytorch>`.
+
+   Primus with Megatron is designed to replace the :doc:`ROCm Megatron-LM
+   training <../megatron-lm>` workflow. To learn how to migrate workloads from
+   Megatron-LM to Primus with Megatron, see
+   :doc:`megatron-lm-primus-migration-guide`.
+
+AMD provides ready-to-use Docker images for MI355X, MI350X,
+MI325X, and MI300X GPUs containing essential components for Primus, ROCm, and
+Megatron-LM.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   .. tab-set::
+
+   {% for supported_gpus, docker in dockers.items() %}
+      .. tab-item:: {{ supported_gpus }}
+         :sync: {{ supported_gpus }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+            {% endfor %}
+   {% endfor %}
+
+.. _amd-primus-megatron-lm-model-support-v259:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on AMD Instinct GPUs.
+Some instructions, commands, and training examples in this documentation
+might vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml
+
+   {% set model_groups = data.model_groups %}
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
+      </div>
+
+.. note::
+
+   Some models, such as Llama, require an external license agreement through
+   a third party (for example, Meta).
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+.. _mi300x-amd-primus-megatron-lm-training-v259:
+
+Environment setup
+=================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml
+
+   Use the following instructions to set up the environment, configure the script to train models, and
+   reproduce the benchmark results on AMD Instinct GPUs.
+
+.. _amd-primus-megatron-lm-requirements-v259:
+
+Pull the Docker image
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+
+   1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
+
+      .. tab-set::
+
+         {% for supported_gpus, docker in dockers.items() %}
+         .. tab-item:: {{ supported_gpus }}
+            :sync: {{ supported_gpus }}
+
+            .. code-block:: shell
+
+               docker pull {{ docker.pull_tag }}
+         {% endfor %}
+
+   2. Launch the Docker container.
+
+      .. tab-set::
+
+         {% for supported_gpus, docker in dockers.items() %}
+         .. tab-item:: {{ supported_gpus }}
+            :sync: {{ supported_gpus }}
+
+            .. code-block:: shell
+
+               docker run -it \
+                   --device /dev/dri \
+                   --device /dev/kfd \
+                   --device /dev/infiniband \
+                   --network host --ipc host \
+                   --group-add video \
+                   --cap-add SYS_PTRACE \
+                   --security-opt seccomp=unconfined \
+                   --privileged \
+                   -v $HOME:$HOME \
+                   --shm-size 128G \
+                   --name primus_training_env \
+                   {{ docker.pull_tag }}
+         {% endfor %}
+
+3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.
+
+   .. code-block:: shell
+
+      docker start primus_training_env
+      docker exec -it primus_training_env bash
+
+The Docker container hosts verified commit ``e16b27b`` of the `Primus
+<https://github.com/AMD-AGI/Primus/tree/e16b27b>`__ repository.
+
+.. _amd-primus-megatron-lm-environment-setup-v259:
+
+Configuration
+=============
+
+Primus defines a training configuration in YAML for each model in
+`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs>`__.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml
+
+   {% set model_groups = data.model_groups %}
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+   .. container:: model-doc {{ model.mad_tag }}
+
+      For example, to update training parameters for {{ model.model }}, you can
+      update ``examples/megatron/configs/{{ model.config_name }}``. Training
+      configuration YAML files for other models follow this naming convention.
+
+      {% endfor %}
+   {% endfor %}
+
+.. note::
+
+   See :ref:`Key options <amd-primus-megatron-lm-benchmark-test-vars>` for more information on configuration options.
+
+Dataset options
+---------------
+
+You can use either mock data or real data for training.
+
+* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default
+  value is ``true`` for enabled.
+
+  .. code-block:: yaml
+
+     mock_data: true
+
+* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset.
+
+  .. code-block:: yaml
+
+     mock_data: false
+     train_data_path: /path/to/your/dataset
+
+  Ensure that the files are accessible inside the Docker container.
+
+.. _amd-primus-megatron-lm-tokenizer-v259:
+
+Tokenizer
+---------
+
+Set the ``HF_TOKEN`` environment variable with
+the right permissions to access the tokenizer for each model.
+
+.. code-block:: bash
+
+   # Export your HF_TOKEN in the workspace
+   export HF_TOKEN=<your_hftoken>
+
+.. note::
+
+   In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama
+   3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
+   ``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model
+   <https://github.com/AMD-AGI/Primus/blob/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
+   definition.
+
+.. _amd-primus-megatron-lm-run-training-v259:
+
+Run training
+============
+
+Use the following example commands to set up the environment, configure
+:ref:`key options <amd-primus-megatron-lm-benchmark-test-vars>`, and run training on
+AMD Instinct GPUs using Primus with the Megatron backend.
+
+Single node training
+--------------------
+
+To run training on a single node, navigate to ``/workspace/Primus`` and use the following setup commands:
+
+.. code-block:: shell
+
+   pip install -r requirements.txt
+   export HSA_NO_SCRATCH_RECLAIM=1
+   export NVTE_CK_USES_BWD_V3=1
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 3.3 70B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To run pre-training for Llama 3.3 70B BF16, run:
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                --train_iters 50 \
+                --micro_batch_size 6 \
+                --global_batch_size 48
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                --train_iters 50 \
+                --micro_batch_size 2 \
+                --global_batch_size 16
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 3.1 8B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To run pre-training for Llama 3.1 8B FP8, run:
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                --train_iters 50 \
+                --fp8 hybrid \
+                --micro_batch_size 4 \
+                --global_batch_size 512
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                --train_iters 50 \
+                --fp8 hybrid
+
+   For Llama 3.1 8B BF16, use the following command:
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                --train_iters 50 \
+                --micro_batch_size 4 \
+                --global_batch_size 512
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 3.1 70B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To run pre-training for Llama 3.1 70B BF16, run:
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                 --train_iters 50 \
+                 --micro_batch_size 4 \
+                 --global_batch_size 32
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                 --train_iters 50
+
+   To run the training on a single node for Llama 3.1 70B FP8, use the following command.
+
+   .. note::
+
+      The MI300X configuration uses a proxy model. On MI300X GPUs, use two or more nodes
+      to run the full Llama 3.1 70B model with FP8 precision. MI355X and MI350X GPUs
+      can support the full 70B model with FP8 precision on a single node.
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                --train_iters 50 \
+                --fp8 hybrid \
+                --no_fp8_weight_transpose_cache true \
+                --micro_batch_size 3 \
+                --global_batch_size 24
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                --train_iters 50 \
+                --num_layers 40 \
+                --fp8 hybrid \
+                --no_fp8_weight_transpose_cache true
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 2 7B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To run pre-training for Llama 2 7B FP8, run:
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                --train_iters 50 \
+                --fp8 hybrid \
+                --micro_batch_size 13 \
+                --global_batch_size 416
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                --train_iters 50 \
+                --fp8 hybrid
+
+   To run pre-training for Llama 2 7B BF16, run:
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                --train_iters 50 \
+                --micro_batch_size 10 \
+                --global_batch_size 640
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 2 70B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To run pre-training for Llama 2 70B BF16, run:
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                --train_iters 50 \
+                --micro_batch_size 17 \
+                --global_batch_size 272
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+            bash ./examples/run_pretrain.sh \
+                --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to DeepSeek-V3.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) BF16 with 3-layer proxy,
+   use the following command:
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --num_layers 3 \
+                --moe_layer_freq 1 \
+                --train_iters 50 \
+                --micro_batch_size 8 \
+                --global_batch_size 64
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --num_layers 3 \
+                --moe_layer_freq 1 \
+                --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to DeepSeek-V2-Lite.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel) BF16,
+   use the following command:
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50 \
+                --micro_batch_size 12 \
+                --global_batch_size 768
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50 \
+                --global_batch_size 256
+
+.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Mixtral 8x7B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
+   use the following command:
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50 \
+                --micro_batch_size 4 \
+                --global_batch_size 256
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Mixtral 8x22B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To run training on a single node for Mixtral 8x22B BF16 (MoE with expert parallel) 4-layer proxy,
+   use the following command:
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50 \
+                --num_layers 4 \
+                --pipeline_model_parallel_size 1 \
+                --micro_batch_size 2 \
+                --global_batch_size 16
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50 \
+                --num_layers 4 \
+                --pipeline_model_parallel_size 1 \
+                --micro_batch_size 1 \
+                --global_batch_size 16
+
+.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Qwen 2.5 7B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To run training on a single node for Qwen 2.5 7B BF16, use the following
+   command:
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50 \
+                --micro_batch_size 16 \
+                --global_batch_size 768
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50
+
+   For FP8, use the following command.
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50 \
+                --fp8 hybrid \
+                --micro_batch_size 20 \
+                --global_batch_size 800
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50 \
+                --fp8 hybrid
+
+.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Qwen 2.5 72B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
+
+   .. tab-set::
+
+      .. tab-item:: MI355X and MI350X
+         :sync: MI355X and MI350X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50 \
+                --micro_batch_size 16 \
+                --global_batch_size 256
+
+      .. tab-item:: MI300X
+         :sync: MI325X and MI300X
+
+         .. code-block:: shell
+
+            EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
+            bash examples/run_pretrain.sh \
+                --train_iters 50
+
+.. _amd-primus-megatron-multi-node-examples-v259:
+
+Multi-node training examples
+----------------------------
+
+Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
+training.
+
+To run training on multiple nodes, you can use the
+`run_slurm_pretrain.sh <https://github.com/AMD-AGI/Primus/blob/main/examples/run_slurm_pretrain.sh>`__
+to launch the multi-node workload. Use the following steps to set up your environment:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.9-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   .. tab-set::
+
+      {% for supported_gpus, docker in dockers.items() %}
+      .. tab-item:: {{ supported_gpus }}
+         :sync: {{ supported_gpus }}
+
+         .. code-block:: shell
+
+            git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git
+            cd Primus
+            git checkout e16b27b
+
+            export DOCKER_IMAGE={{ docker.pull_tag }}
+            export HF_TOKEN=<your_HF_token>
+            export HSA_NO_SCRATCH_RECLAIM=1
+            export NVTE_CK_USES_BWD_V3=1
+            export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
+            export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
+            export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
+            export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
+      {% endfor %}
+
+.. note::
+
+   * Make sure the correct network drivers are installed on the nodes. If running inside a Docker container, either install the drivers inside the container or pass the network drivers from the host when creating the container.
+   * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect them. However, since NICs can vary across clusters, it is recommended to explicitly export the NCCL parameters for your cluster.
+   * To find your network interface, you can use ``ip a``.
+   * To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices.
+   * Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v259`) as appropriate.
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 3.1 8B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To train Llama 3.1 8B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters.
+      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+      bash ./examples/run_slurm_pretrain.sh \
+          --global_batch_size 1024 \
+          --fp8 hybrid
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 2 7B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To train Llama 2 7B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters.
+      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+      bash ./examples/run_slurm_pretrain.sh \
+          --global_batch_size 2048 \
+          --fp8 hybrid
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 3.1 70B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To train Llama 3.1 70B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters.
+      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 4 \
+          --global_batch_size 256 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+   To train Llama 3.1 70B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 1 \
+          --global_batch_size 256 \
+          --recompute_num_layers 12
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 2 70B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To train Llama 2 70B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters.
+      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 10 \
+          --global_batch_size 640 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+   To train Llama 2 70B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+      bash ./examples/run_slurm_pretrain.sh \
+          --micro_batch_size 2 \
+          --global_batch_size 1536 \
+          --recompute_num_layers 12
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Llama 3.3 70B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To train Llama 3.3 70B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters.
+      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 4 \
+          --global_batch_size 256 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+   To train Llama 3.3 70B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 \
+      EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 1 \
+          --global_batch_size 256 \
+          --recompute_num_layers 12
+
+.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Mixtral 8x7B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To train Mixtral 8x7B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters.
+      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
+      NNODES=8 \
+      EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 2 \
+          --global_batch_size 256
+
+.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
+
+   Once setup is complete, run the appropriate training command.
+   The following run commands are tailored to Qwen 2.5 72B.
+   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+
+   To train Qwen2.5 72B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters.
+      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
+      NNODES=8 \
+      EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 8 \
+          --global_batch_size 512 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+.. _amd-primus-megatron-lm-benchmark-test-vars-v259:
+
+Key options
+-----------
+
+The following are key options to take note of:
+
+fp8
+  ``hybrid`` enables FP8 GEMMs.
+
+use_torch_fsdp2
+  ``use_torch_fsdp2: 1`` enables PyTorch FSDP2. If FSDP is enabled,
+  set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``.
+
+profile
+  To enable PyTorch profiling, set these parameters:
+
+  .. code-block:: yaml
+
+     profile: true
+     use_pytorch_profiler: true
+     profile_step_end: 7
+     profile_step_start: 6
+
+train_iters
+  The total number of iterations (default: 50).
+
+mock_data
+  True by default.
+
+micro_batch_size
+  Micro batch size.
+
+global_batch_size
+  Global batch size.
+
+recompute_granularity
+  For activation checkpointing.
+
+num_layers
+  For using a reduced number of layers as with proxy models.
+
+Known issues
+============
+
+PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
+
+Further reading
+===============
+
+- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
+  Framework for Large Models on AMD GPUs <https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`megatron-lm-history` to find documentation for previous releases
+of the ``ROCm/megatron-lm`` Docker image.
+
+This training environment now uses Primus with Megatron as the primary
+configuration. Limited support for the legacy ROCm Megatron-LM is still
+available; see the :doc:`../megatron-lm` documentation.
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.9.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.9.rst
new file mode 100644
index 000000000..964c3db27
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.9.rst
@@ -0,0 +1,574 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using PyTorch for ROCm.
+   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+****************************************
+Training a model with Primus and PyTorch
+****************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm Primus PyTorch training
+   performance benchmark documentation. See :doc:`../primus-pytorch` for the latest version.
+
+`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
+LLM training framework designed to streamline training. It streamlines LLM
+training on AMD Instinct GPUs using a modular, reproducible configuration paradigm.
+Primus now supports the PyTorch torchtitan backend.
+
+.. note::
+
+   For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
+   <https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
+   deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
+   The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
+   including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
+
+   Primus with the PyTorch torchtitan backend is designed to replace the
+   :doc:`ROCm PyTorch training <../pytorch-training>` workflow. See
+   :doc:`../pytorch-training` for steps to run workloads without Primus.
+
+AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and
+MI300X GPUs containing essential components for Primus and PyTorch training
+with Primus Turbo optimizations.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   .. tab-set::
+
+   {% for supported_gpus, docker in dockers.items() %}
+      .. tab-item:: {{ supported_gpus }}
+         :sync: {{ supported_gpus }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+            {% endfor %}
+   {% endfor %}
+
+.. _amd-primus-pytorch-model-support-v259:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs.
+Some instructions, commands, and training recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
+
+   {% set model_groups = data.model_groups %}
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-12 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
+      </div>
+
+.. seealso::
+
+   For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
+   see the documentation :doc:`../pytorch-training` (without Primus).
+
+.. _amd-primus-pytorch-performance-measurements-v259:
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+This Docker image is optimized for specific model configurations outlined
+below. Performance can vary for other training workloads, as AMD
+doesn’t test configurations and run conditions outside those described.
+
+Pull the Docker image
+=====================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. tab-set::
+
+      {% for supported_gpus, docker in dockers.items() %}
+      .. tab-item:: {{ supported_gpus }}
+         :sync: {{ supported_gpus }}
+
+         .. code-block:: shell
+
+            docker pull {{ docker.pull_tag }}
+      {% endfor %}
+
+Run training
+============
+
+Once the setup is complete, choose one of the following workflows to start benchmarking training.
+For fine-tuning workloads and multi-node training examples, see :doc:`../pytorch-training` (without Primus).
+For best performance on MI325X, MI350X, and MI355X GPUs, you might need to
+tweak some configurations (such as batch sizes).
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   {% set model_groups = data.model_groups %}
+
+   .. tab-set::
+
+      .. tab-item:: MAD-integrated benchmarking
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            The following run command is tailored to {{ model.model }}.
+            See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
+               using one node with the {{ model.precision }} data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{ model.mad_tag }} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+               MAD launches a Docker container with the name
+               ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
+               model are collected in ``~/MAD/perf.csv``.
+
+               .. note::
+
+                  Currently, Primus torchtitan models are run with Primus Turbo
+                  enabled for enhanced performance. To disable Primus Turbo,
+                  modify the respective configuration file
+                  ``scripts/primus/pytorch_train/primus_torchtitan_scripts/llama3_[8B|70B]-[BF16|FP8].yaml``.
+
+      {% endfor %}
+   {% endfor %}
+
+      .. tab-item:: Primus benchmarking
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            The following run commands are tailored to {{ model.model }}.
+            See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
+
+            .. rubric:: Download the Docker image and required packages
+
+            1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
+
+               .. tab-set::
+
+                  {% for supported_gpus, docker in dockers.items() %}
+                  .. tab-item:: {{ supported_gpus }}
+                     :sync: {{ supported_gpus }}
+
+                     .. code-block:: shell
+
+                        docker pull {{ docker.pull_tag }}
+                  {% endfor %}
+
+            2. Run the Docker container.
+
+               .. tab-set::
+
+                  {% for supported_gpus, docker in dockers.items() %}
+                  .. tab-item:: {{ supported_gpus }}
+                     :sync: {{ supported_gpus }}
+
+                     .. code-block:: shell
+
+                        docker run -it \
+                            --device /dev/dri \
+                            --device /dev/kfd \
+                            --network host \
+                            --ipc host \
+                            --group-add video \
+                            --cap-add SYS_PTRACE \
+                            --security-opt seccomp=unconfined \
+                            --privileged \
+                            -v $HOME:$HOME \
+                            -v $HOME/.ssh:/root/.ssh \
+                            --shm-size 64G \
+                            --name training_env \
+                            {{ docker.pull_tag }}
+                  {% endfor %}
+
+               Use these commands if you exit the ``training_env`` container and need to return to it.
+
+               .. code-block:: shell
+
+                  docker start training_env
+                  docker exec -it training_env bash
+
+            .. rubric:: Prepare training datasets and dependencies
+
+            The following benchmarking examples require downloading models and datasets
+            from Hugging Face. To ensure successful access to gated repos, set your
+            ``HF_TOKEN``.
+
+            .. code-block:: shell
+
+               export HF_TOKEN=$your_personal_hugging_face_access_token
+
+            .. rubric:: Pretraining
+
+            To get started, navigate to the ``Primus`` directory in your container.
+
+            .. code-block::
+
+               cd /workspace/Primus
+
+            Now, to start the pretraining benchmark, use the ``run_pretrain.sh`` script
+            included with Primus with the appropriate options.
+
+            .. rubric:: Benchmarking examples
+
+            .. container:: model-doc primus_pyt_train_llama-3.1-8b
+
+               Use the following command to train Llama 3.1 8B with BF16 precision using Primus torchtitan.
+
+               .. tab-set::
+
+                  .. tab-item:: MI355X and MI350X
+                     :sync: MI355X and MI350X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh \
+                            --metrics.enable_tensorboard false \
+                            --profiling.enable_profiling false \
+                            --training.batch_size 5
+
+                  .. tab-item:: MI325X
+                     :sync: MI325X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh \
+                            --metrics.enable_tensorboard false \
+                            --profiling.enable_profiling false \
+                            --training.batch_size 6
+
+                  .. tab-item:: MI300X
+                     :sync: MI325X and MI300X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh \
+                            --metrics.enable_tensorboard false \
+                            --profiling.enable_profiling false \
+                            --training.batch_size 4
+
+
+               To train Llama 3.1 8B with FP8 precision, use the following command.
+
+               .. tab-set::
+
+                  .. tab-item:: MI355X and MI350X
+                     :sync: MI355X and MI350X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh \
+                            --metrics.enable_tensorboard false \
+                            --profiling.enable_profiling false \
+                            --training.batch_size 8
+
+                  .. tab-item:: MI325X
+                     :sync: MI325X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh \
+                            --metrics.enable_tensorboard false \
+                            --profiling.enable_profiling false \
+                            --training.batch_size 7
+
+                  .. tab-item:: MI300X
+                     :sync: MI325X and MI300X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh \
+                            --metrics.enable_tensorboard false \
+                            --profiling.enable_profiling false \
+                            --training.batch_size 5
+
+            .. container:: model-doc primus_pyt_train_llama-3.1-70b
+
+               Use the following command to train Llama 3.1 70B with BF16 precision using Primus torchtitan.
+
+               .. tab-set::
+
+                  .. tab-item:: MI355X and MI350X
+                     :sync: MI355X and MI350X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh \
+                            --metrics.enable_tensorboard false \
+                            --profiling.enable_profiling false \
+                            --training.batch_size 8
+
+                  .. tab-item:: MI325X
+                     :sync: MI325X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh \
+                            --metrics.enable_tensorboard false \
+                            --profiling.enable_profiling false \
+                            --training.batch_size 6
+
+                  .. tab-item:: MI300X
+                     :sync: MI325X and MI300X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh \
+                            --metrics.enable_tensorboard false \
+                            --profiling.enable_profiling false \
+                            --training.batch_size 4
+
+               To train Llama 3.1 70B with FP8 precision, use the following command.
+
+               .. tab-set::
+
+                  .. tab-item:: MI355X and MI350X
+                     :sync: MI355X and MI300X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh \
+                            --metrics.enable_tensorboard false \
+                            --profiling.enable_profiling false \
+                            --training.batch_size 6
+
+                  .. tab-item:: MI325X
+                     :sync: MI325X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh \
+                            --metrics.enable_tensorboard false \
+                            --profiling.enable_profiling false \
+                            --training.batch_size 5
+
+                  .. tab-item:: MI300X
+                     :sync: MI325X and MI300X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh \
+                            --metrics.enable_tensorboard false \
+                            --profiling.enable_profiling false \
+                            --training.batch_size 3
+      {% endfor %}
+   {% endfor %}
+
+      .. tab-item:: Standalone torchtitan benchmarking
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            The following run commands are tailored to {{ model.model }}.
+            See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
+
+            .. rubric:: Download the Docker image and required packages
+
+            1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
+
+               .. tab-set::
+
+                  {% for supported_gpus, docker in dockers.items() %}
+                  .. tab-item:: {{ supported_gpus }}
+                     :sync: {{ supported_gpus }}
+
+                     .. code-block:: shell
+
+                        docker pull {{ docker.pull_tag }}
+                  {% endfor %}
+
+            2. Run the Docker container.
+
+               .. tab-set::
+
+                  {% for supported_gpus, docker in dockers.items() %}
+                  .. tab-item:: {{ supported_gpus }}
+                     :sync: {{ supported_gpus }}
+
+                     .. code-block:: shell
+
+                        docker run -it \
+                            --device /dev/dri \
+                            --device /dev/kfd \
+                            --network host \
+                            --ipc host \
+                            --group-add video \
+                            --cap-add SYS_PTRACE \
+                            --security-opt seccomp=unconfined \
+                            --privileged \
+                            -v $HOME:$HOME \
+                            -v $HOME/.ssh:/root/.ssh \
+                            --shm-size 64G \
+                            --name training_env \
+                            {{ docker.pull_tag }}
+                  {% endfor %}
+
+               Use these commands if you exit the ``training_env`` container and need to return to it.
+
+               .. code-block:: shell
+
+                  docker start training_env
+                  docker exec -it training_env bash
+
+            3. Navigate to the ``torchtitan`` workspace directory.
+
+               .. code-block:: shell
+
+                  cd /workspace/torchtitan
+
+            .. rubric:: Download the tokenizer
+
+            1. The following benchmarking examples require downloading models and datasets
+               from Hugging Face. To ensure successful access to gated repos, set your
+               ``HF_TOKEN``.
+
+               .. code-block:: shell
+
+                  export HF_TOKEN=$your_personal_hugging_face_access_token
+
+            2. Download the tokenizer for your model.
+
+               .. container:: model-doc {{ model.mad_tag }}
+
+                  .. code-block:: shell
+
+                     python3 scripts/download_tokenizer.py \
+                        --repo_id {{ model.model_repo }} \
+                        --tokenizer_path "original" \
+                        --hf_token=${HF_TOKEN}
+
+            .. rubric:: Pretraining examples
+
+            Run the training script with the appropriate configuration file.
+
+            To train with BF16 precision, use the following command:
+
+            .. container:: model-doc {{ model.mad_tag }}
+
+               .. code-block:: shell
+
+                  CONFIG_FILE={{ model.config_file.bf16 }} \
+                  ./run_train.sh
+
+            To train with FP8 precision, use the following command:
+
+            .. container:: model-doc {{ model.mad_tag }}
+
+               .. code-block:: shell
+
+                  CONFIG_FILE={{ model.config_file.fp8 }} \
+                  ./run_train.sh
+      {% endfor %}
+   {% endfor %}
+
+Known issues
+============
+
+PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
+
+
+Further reading
+===============
+
+- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
+  Framework for Large Models on AMD GPUs <https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html>`__.
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
index 5d0250179..d6487eb6f 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
@@ -16,14 +16,23 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
      - Components
      - Resources
 
-   * - v25.9 (latest)
+   * - v25.10 (latest)
+     -
+       * ROCm 7.1.0
+       * PyTorch 2.10.0.dev20251112+rocm7.1
+     -
+       * :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
+       * :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
+
+   * - v25.9
      -
        * ROCm 7.0.0
        * Primus 0.3.0
        * PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
      -
-       * :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
-       * :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
+       * :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.9>`
+       * :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.9>`
        * `Docker Hub (gfx950) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
        * `Docker Hub (gfx942) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357>`__
 
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.9.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.9.rst
new file mode 100644
index 000000000..6bafba855
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.9.rst
@@ -0,0 +1,667 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using PyTorch for ROCm.
+   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with PyTorch on ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm PyTorch training
+   performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
+
+.. note::
+
+   For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
+   <https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
+   deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
+   The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
+   including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
+
+   See :doc:`../primus-pytorch` for details.
+
+PyTorch is an open-source machine learning framework that is widely used for
+model training with GPU-optimized components for transformer-based models.
+The PyTorch for ROCm training Docker image provides a prebuilt optimized
+environment for fine-tuning and pretraining a model on AMD Instinct MI325X
+and MI300X GPUs. It includes the following software components to accelerate
+training workloads:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   .. tab-set::
+
+   {% for supported_gpus, docker in dockers.items() %}
+      .. tab-item:: {{ supported_gpus }}
+         :sync: {{ supported_gpus }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+            {% endfor %}
+   {% endfor %}
+
+.. _amd-pytorch-training-model-support-v259:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct
+MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and
+training recommendations in this documentation might vary by model -- select
+one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
+
+   {% set model_groups = data.model_groups %}
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
+      </div>
+
+.. _amd-pytorch-training-supported-training-modes-v259:
+
+The following table lists supported training modes per model.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
+
+   {% set model_groups = data.model_groups %}
+   .. dropdown:: Supported training modes
+
+      .. list-table::
+         :header-rows: 1
+
+         * - Model
+           - Supported training modes
+
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+         {% if model.training_modes %}
+         * - {{ model.model }}
+           - ``{{ model.training_modes | join('``, ``') }}``
+
+         {% endif %}
+         {% endfor %}
+      {% endfor %}
+
+      .. note::
+
+         Some model and fine-tuning combinations are not listed. This is
+         because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
+         doesn't provide default YAML configurations for them.
+         For advanced usage, you can create a custom configuration to enable
+         unlisted fine-tuning methods by using an existing file in the
+         ``/workspace/torchtune/recipes/configs`` directory as a template.
+
+.. _amd-pytorch-training-performance-measurements-v259:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+page provides reference throughput and latency measurements for training
+popular AI models.
+
+.. note::
+
+   The performance data presented in
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+   should not be interpreted as the peak performance achievable by AMD
+   Instinct MI325X and MI300X GPUs or ROCm software.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+This Docker image is optimized for specific model configurations outlined
+below. Performance can vary for other training workloads, as AMD
+doesn’t test configurations and run conditions outside those described.
+
+Run training
+============
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   {% set model_groups = data.model_groups %}
+
+   Once the setup is complete, choose between two options to start benchmarking training:
+
+   .. tab-set::
+
+      .. tab-item:: MAD-integrated benchmarking
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            The following run command is tailored to {{ model.model }}.
+            See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
+               using one node with the {{ model.precision }} data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{ model.mad_tag }} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+               MAD launches a Docker container with the name
+               ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
+               model are collected in ``~/MAD/perf.csv``.
+
+      {% endfor %}
+   {% endfor %}
+
+      .. tab-item:: Standalone benchmarking
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            The following commands are tailored to {{ model.model }}.
+            See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
+
+      {% endfor %}
+   {% endfor %}
+
+         .. rubric:: Download the Docker image and required packages
+
+         1. Use the following command to pull the Docker image from Docker Hub.
+
+            .. tab-set::
+
+               {% for supported_gpus, docker in dockers.items() %}
+               .. tab-item:: {{ supported_gpus }}
+                  :sync: {{ supported_gpus }}
+
+                  .. code-block:: shell
+
+                     docker pull {{ docker.pull_tag }}
+               {% endfor %}
+
+         2. Launch the Docker container.
+
+            .. tab-set::
+
+               {% for supported_gpus, docker in dockers.items() %}
+               .. tab-item:: {{ supported_gpus }}
+                  :sync: {{ supported_gpus }}
+
+                  .. code-block:: shell
+
+                     docker run -it \
+                         --device /dev/dri \
+                         --device /dev/kfd \
+                         --network host \
+                         --ipc host \
+                         --group-add video \
+                         --cap-add SYS_PTRACE \
+                         --security-opt seccomp=unconfined \
+                         --privileged \
+                         -v $HOME:$HOME \
+                         -v $HOME/.ssh:/root/.ssh \
+                         --shm-size 64G \
+                         --name training_env \
+                         {{ docker.pull_tag }}
+               {% endfor %}
+
+            Use these commands if you exit the ``training_env`` container and need to return to it.
+
+            .. code-block:: shell
+
+               docker start training_env
+               docker exec -it training_env bash
+
+         3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+            repository and navigate to the benchmark scripts directory
+            ``/workspace/MAD/scripts/pytorch_train``.
+
+            .. code-block:: shell
+
+               git clone https://github.com/ROCm/MAD
+               cd MAD/scripts/pytorch_train
+
+         .. rubric:: Prepare training datasets and dependencies
+
+         1. The following benchmarking examples require downloading models and datasets
+            from Hugging Face. To ensure successful access to gated repos, set your
+            ``HF_TOKEN``.
+
+            .. code-block:: shell
+
+               export HF_TOKEN=$your_personal_hugging_face_access_token
+
+         2. Run the setup script to install libraries and datasets needed for benchmarking.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_setup.sh
+
+            .. container:: model-doc pyt_train_llama-3.1-8b
+
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
+
+               .. list-table::
+                  :header-rows: 1
+
+                  * - Library
+                    - Reference
+
+                  * - ``accelerate``
+                    - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+            .. container:: model-doc pyt_train_llama-3.1-70b
+
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
+
+               .. list-table::
+                  :header-rows: 1
+
+                  * - Library
+                    - Reference
+
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+                  * - ``torchdata``
+                    - `TorchData <https://meta-pytorch.org/data/beta/index.html#torchdata>`__
+
+                  * - ``tomli``
+                    - `Tomli <https://pypi.org/project/tomli/>`__
+
+                  * - ``tiktoken``
+                    - `tiktoken <https://github.com/openai/tiktoken>`__
+
+                  * - ``blobfile``
+                    - `blobfile <https://pypi.org/project/blobfile/>`__
+
+                  * - ``tabulate``
+                    - `tabulate <https://pypi.org/project/tabulate/>`__
+
+                  * - ``wandb``
+                    - `Weights & Biases <https://github.com/wandb/wandb>`__
+
+                  * - ``sentencepiece``
+                    - `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
+
+                  * - ``tensorboard``
+                    - `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
+
+            .. container:: model-doc pyt_train_flux
+
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
+
+               .. list-table::
+                  :header-rows: 1
+
+                  * - Library
+                    - Reference
+
+                  * - ``accelerate``
+                    - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`__ 3.2.0
+
+                  * - ``sentencepiece``
+                    - `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
+
+                  * - ``tensorboard``
+                    - `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
+
+                  * - ``csvkit``
+                    - `csvkit <https://csvkit.readthedocs.io/en/latest/>`__ 2.0.1
+
+                  * - ``deepspeed``
+                    - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`__ 0.16.2
+
+                  * - ``diffusers``
+                    - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`__ 0.31.0
+
+                  * - ``GitPython``
+                    - `GitPython <https://github.com/gitpython-developers/GitPython>`__ 3.1.44
+
+                  * - ``opencv-python-headless``
+                    - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`__ 4.10.0.84
+
+                  * - ``peft``
+                    - `PEFT <https://huggingface.co/docs/peft/en/index>`__ 0.14.0
+
+                  * - ``protobuf``
+                    - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`__ 5.29.2
+
+                  * - ``pytest``
+                    - `PyTest <https://docs.pytest.org/en/stable/>`__ 8.3.4
+
+                  * - ``python-dotenv``
+                    - `python-dotenv <https://pypi.org/project/python-dotenv/>`__ 1.0.1
+
+                  * - ``seaborn``
+                    - `Seaborn <https://seaborn.pydata.org/>`__ 0.13.2
+
+                  * - ``transformers``
+                    - `Transformers <https://huggingface.co/docs/transformers/en/index>`__ 4.47.0
+
+            ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
+
+            * `frank-chieng/chinese_architecture_siheyuan <https://huggingface.co/datasets/frank-chieng/chinese_architecture_siheyuan>`__
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+         {% set training_modes = model.training_modes %}
+         {% set training_mode_descs = {
+            "pretrain": "Benchmark pre-training.",
+            "HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
+         } %}
+         {% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
+         {% if available_modes %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Pre-training
+
+            To start the pre-training benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length
+
+            {% if model.mad_tag == "pyt_train_flux" %}
+            .. container:: model-doc {{ model.mad_tag }}
+
+               .. note::
+
+                  Currently, FLUX models are not supported out-of-the-box on this Docker.
+                  To use FLUX, refer to the ``rocm/pytorch-training`` Docker image: :doc:`pytorch-training-v25.6`.
+
+                  Occasionally, downloading the Flux dataset might fail. In the event of this
+                  error, manually download it from Hugging Face at
+                  `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+                  and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
+                  the required dataset.
+            {% endif %}
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}
+
+               * - ``$datatype``
+                 - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
+                 - Only Llama 3.1 8B supports FP8 precision.
+
+               * - ``$sequence_length``
+                 - Between 2048 and 8192. 8192 by default.
+                 - Sequence length for the language model.
+         {% endif %}
+
+         {% set training_modes = model.training_modes %}
+         {% set training_mode_descs = {
+            "posttrain": "Benchmark post-training.",
+         } %}
+         {% set available_modes = training_modes | select("in", ["posttrain"]) | list %}
+         {% if available_modes %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Post-training
+
+            To start the post-training benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}
+
+               * - ``$datatype``
+                 - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
+                 - Only Llama 3.1 8B supports FP8 precision.
+
+               * - ``$sequence_length``
+                 - Between 2048 and 8192. 8192 by default.
+                 - Sequence length for the language model.
+         {% endif %}
+
+         {% set training_mode_descs = {
+            "finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
+            "finetune_lora": "LoRA fine-tuning (BF16 supported).",
+            "finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
+            "HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
+         } %}
+         {% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
+         {% if available_modes %}
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Fine-tuning
+
+            To start the fine-tuning benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+            See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v259>`.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t $training_mode \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}
+
+               * - ``$datatype``
+                 - ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
+                 - All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}
+
+               * - ``$sequence_length``
+                 - Between 2048 and 16384.
+                 - Sequence length for the language model.
+
+            {% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
+            .. note::
+
+               For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
+               use the following torchtune commit for compatibility:
+
+               .. code-block:: shell
+
+                  git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
+
+            {% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
+            .. note::
+
+               You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
+               input tensor should be smaller than max_seq_len (4096)``.
+               This error indicates that an input sequence is longer than the model's maximum context window.
+
+               Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
+               tokens in this case). You can resolve this by truncating the input or splitting
+               it into smaller chunks before passing it to the model.
+
+               Note on reproducibility: The results in this guide are based on
+               commit ``b4c98ac`` from the upstream
+               `<https://github.com/pytorch/torchtune>`__ repository. For the
+               latest updates, you can use the main branch.
+
+            {% endif %}
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+
+            .. rubric:: Benchmarking examples
+
+            For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+
+.. _amd-pytorch-training-multinode-examples-v259:
+
+Multi-node training
+-------------------
+
+Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
+training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands.
+
+Pre-training
+~~~~~~~~~~~~
+
+Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch run_slurm_train.sh
+
+Fine-tuning
+~~~~~~~~~~~
+
+Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   huggingface-cli login # Get access to HF Llama model space
+   huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch Torchtune_Multinode.sh
+
+.. note::
+
+   Information regarding benchmark setup:
+
+   * By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
+   * You can adjust the torchtune `YAML configuration file
+     <https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
+     if you're using a different model.
+   * The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
+   * Set the ``mounting_paths`` inside the SLURM script.
+
+Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
+
+Known issues
+============
+
+PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
+
+Further reading
+===============
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
index 06cca9ed6..ed0c2a637 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
@@ -31,12 +31,10 @@ Megatron-LM.
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
 
-   {% set dockers = data.dockers %}
    .. tab-set::
 
-   {% for supported_gpus, docker in dockers.items() %}
-      .. tab-item:: {{ supported_gpus }}
-         :sync: {{ supported_gpus }}
+      .. tab-item:: {{ data.docker.pull_tag }}
+         :sync: {{ data.docker.pull_tag }}
 
          .. list-table::
             :header-rows: 1
@@ -44,13 +42,12 @@ Megatron-LM.
             * - Software component
               - Version
 
-            {% for component_name, component_version in docker.components.items() %}
+            {% for component_name, component_version in data.docker.components.items() %}
             * - {{ component_name }}
               - {{ component_version }}
             {% endfor %}
-   {% endfor %}
 
-.. _amd-primus-megatron-lm-model-support-v259:
+.. _amd-primus-megatron-lm-model-support-v2510:
 
 Supported models
 ================
@@ -111,7 +108,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
 <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
 system's configuration.
 
-.. _mi300x-amd-primus-megatron-lm-training-v259:
+.. _mi300x-amd-primus-megatron-lm-training-v2510:
 
 Environment setup
 =================
@@ -121,63 +118,49 @@ Environment setup
    Use the following instructions to set up the environment, configure the script to train models, and
    reproduce the benchmark results on AMD Instinct GPUs.
 
-.. _amd-primus-megatron-lm-requirements-v259:
+.. _amd-primus-megatron-lm-requirements-v2510:
 
 Pull the Docker image
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
 
-   {% set dockers = data.dockers %}
+   {% set docker = data.docker %}
 
-   1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
+   1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.
 
-      .. tab-set::
+      .. code-block:: shell
 
-         {% for supported_gpus, docker in dockers.items() %}
-         .. tab-item:: {{ supported_gpus }}
-            :sync: {{ supported_gpus }}
-
-            .. code-block:: shell
-
-               docker pull {{ docker.pull_tag }}
-         {% endfor %}
+         docker pull {{ docker.pull_tag }}
 
    2. Launch the Docker container.
 
-      .. tab-set::
+      .. code-block:: shell
 
-         {% for supported_gpus, docker in dockers.items() %}
-         .. tab-item:: {{ supported_gpus }}
-            :sync: {{ supported_gpus }}
+         docker run -it \
+             --device /dev/dri \
+             --device /dev/kfd \
+             --device /dev/infiniband \
+             --network host --ipc host \
+             --group-add video \
+             --cap-add SYS_PTRACE \
+             --security-opt seccomp=unconfined \
+             --privileged \
+             -v $HOME:$HOME \
+             --shm-size 128G \
+             --name primus_training_env \
+             {{ docker.pull_tag }}
 
-            .. code-block:: shell
+      Use these commands if you exit the ``primus_training_env`` container and need to return to it.
 
-               docker run -it \
-                   --device /dev/dri \
-                   --device /dev/kfd \
-                   --device /dev/infiniband \
-                   --network host --ipc host \
-                   --group-add video \
-                   --cap-add SYS_PTRACE \
-                   --security-opt seccomp=unconfined \
-                   --privileged \
-                   -v $HOME:$HOME \
-                   --shm-size 128G \
-                   --name primus_training_env \
-                   {{ docker.pull_tag }}
-         {% endfor %}
+      .. code-block:: shell
 
-3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.
+         docker start primus_training_env
+         docker exec -it primus_training_env bash
 
-   .. code-block:: shell
+The Docker container hosts verified branch ``release/v25.10`` of the `Primus
+<https://github.com/AMD-AGI/Primus/tree/release/v25.10>`__ repository.
 
-      docker start primus_training_env
-      docker exec -it primus_training_env bash
-
-The Docker container hosts verified commit ``e16b27b`` of the `Primus
-<https://github.com/AMD-AGI/Primus/tree/e16b27b>`__ repository.
-
-.. _amd-primus-megatron-lm-environment-setup-v259:
+.. _amd-primus-megatron-lm-environment-setup-v2510:
 
 Configuration
 =============
@@ -224,7 +207,7 @@ You can use either mock data or real data for training.
 
   Ensure that the files are accessible inside the Docker container.
 
-.. _amd-primus-megatron-lm-tokenizer-v259:
+.. _amd-primus-megatron-lm-tokenizer-v2510:
 
 Tokenizer
 ---------
@@ -245,7 +228,7 @@ right permissions to access the tokenizer for each model.
    <https://github.com/AMD-AGI/Primus/blob/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
    definition.
 
-.. _amd-primus-megatron-lm-run-training-v259:
+.. _amd-primus-megatron-lm-run-training-v2510:
 
 Run training
 ============
@@ -269,7 +252,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Llama 3.3 70B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To run pre-training for Llama 3.3 70B BF16, run:
 
@@ -280,7 +263,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/llama3.3_70B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                 --train_iters 50 \
                 --micro_batch_size 6 \
@@ -291,7 +274,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/llama3.3_70B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                 --train_iters 50 \
                 --micro_batch_size 2 \
@@ -301,7 +289,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Llama 3.1 8B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To run pre-training for Llama 3.1 8B FP8, run:
 
@@ -312,7 +300,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/llama3.1_8B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                 --train_iters 50 \
                 --fp8 hybrid \
@@ -324,7 +312,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                 --train_iters 50 \
                 --fp8 hybrid
@@ -338,7 +331,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/llama3.1_8B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                 --train_iters 50 \
                 --micro_batch_size 4 \
@@ -349,7 +342,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                 --train_iters 50
 
@@ -357,7 +355,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Llama 3.1 70B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To run pre-training for Llama 3.1 70B BF16, run:
 
@@ -368,7 +366,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/llama3.1_70B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                  --train_iters 50 \
                  --micro_batch_size 4 \
@@ -379,7 +377,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/llama3.1_70B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                  --train_iters 50
 
@@ -398,7 +401,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/llama3.1_70B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                 --train_iters 50 \
                 --fp8 hybrid \
@@ -411,7 +414,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/llama3.1_70B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                 --train_iters 50 \
                 --num_layers 40 \
@@ -422,7 +430,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Llama 2 7B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To run pre-training for Llama 2 7B FP8, run:
 
@@ -433,7 +441,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/llama2_7B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                 --train_iters 50 \
                 --fp8 hybrid \
@@ -445,7 +453,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/llama2_7B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                 --train_iters 50 \
                 --fp8 hybrid
@@ -459,7 +472,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/llama2_7B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                 --train_iters 50 \
                 --micro_batch_size 10 \
@@ -470,7 +483,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/llama2_7B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                 --train_iters 50
 
@@ -478,7 +496,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Llama 2 70B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To run pre-training for Llama 2 70B BF16, run:
 
@@ -489,7 +507,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/llama2_70B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                 --train_iters 50 \
                 --micro_batch_size 17 \
@@ -500,7 +518,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/llama2_70B-pretrain.yaml \
             bash ./examples/run_pretrain.sh \
                 --train_iters 50
 
@@ -508,7 +531,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to DeepSeek-V3.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To run training on a single node for DeepSeek-V3 (MoE with expert parallel) BF16 with 3-layer proxy,
    use the following command:
@@ -520,7 +543,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/deepseek_v3-pretrain.yaml \
             bash examples/run_pretrain.sh \
                 --num_layers 3 \
                 --moe_layer_freq 1 \
@@ -533,17 +556,24 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/deepseek_v3-pretrain.yaml \
             bash examples/run_pretrain.sh \
                 --num_layers 3 \
                 --moe_layer_freq 1 \
+                --micro_batch_size 3 \
+                --global_batch_size 192 \
                 --train_iters 50
 
 .. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to DeepSeek-V2-Lite.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel) BF16,
    use the following command:
@@ -555,7 +585,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-pretrain.yaml \
             bash examples/run_pretrain.sh \
                 --train_iters 50 \
                 --micro_batch_size 12 \
@@ -566,7 +596,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/deepseek_v2_lite-pretrain.yaml \
             bash examples/run_pretrain.sh \
                 --train_iters 50 \
                 --global_batch_size 256
@@ -575,7 +610,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Mixtral 8x7B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
    use the following command:
@@ -587,7 +622,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-pretrain.yaml \
             bash examples/run_pretrain.sh \
                 --train_iters 50 \
                 --micro_batch_size 4 \
@@ -598,7 +633,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-pretrain.yaml \
             bash examples/run_pretrain.sh \
                 --train_iters 50
 
@@ -606,7 +646,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Mixtral 8x22B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To run training on a single node for Mixtral 8x22B BF16 (MoE with expert parallel) 4-layer proxy,
    use the following command:
@@ -618,7 +658,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-pretrain.yaml \
             bash examples/run_pretrain.sh \
                 --train_iters 50 \
                 --num_layers 4 \
@@ -631,7 +671,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-pretrain.yaml \
             bash examples/run_pretrain.sh \
                 --train_iters 50 \
                 --num_layers 4 \
@@ -643,7 +688,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Qwen 2.5 7B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To run training on a single node for Qwen 2.5 7B BF16, use the following
    command:
@@ -655,7 +700,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/qwen2.5_7B-pretrain.yaml \
             bash examples/run_pretrain.sh \
                 --train_iters 50 \
                 --micro_batch_size 16 \
@@ -666,7 +711,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/qwen2.5_7B-pretrain.yaml \
             bash examples/run_pretrain.sh \
                 --train_iters 50
 
@@ -679,7 +729,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/qwen2.5_7B-pretrain.yaml \
             bash examples/run_pretrain.sh \
                 --train_iters 50 \
                 --fp8 hybrid
@@ -691,7 +741,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/qwen2.5_7B-pretrain.yaml \
             bash examples/run_pretrain.sh \
                 --train_iters 50 \
                 --fp8 hybrid
@@ -700,7 +755,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Qwen 2.5 72B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
 
@@ -711,7 +766,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/qwen2.5_72B-pretrain.yaml \
             bash examples/run_pretrain.sh \
                 --train_iters 50 \
                 --micro_batch_size 16 \
@@ -722,11 +777,16 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
 
          .. code-block:: shell
 
-            EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
+            # Set the variables for better performance
+            # only on MI325X and MI300X
+            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+            export NVTE_CK_IS_V3_ATOMIC_FP32=1
+
+            EXP=examples/megatron/configs/MI300X/qwen2.5_72B-pretrain.yaml \
             bash examples/run_pretrain.sh \
                 --train_iters 50
 
-.. _amd-primus-megatron-multi-node-examples-v259:
+.. _amd-primus-megatron-multi-node-examples-v2510:
 
 Multi-node training examples
 ----------------------------
@@ -740,28 +800,27 @@ to launch the multi-node workload. Use the following steps to setup your environ
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
 
-   {% set dockers = data.dockers %}
-   .. tab-set::
+   {% set docker = data.docker %}
+   .. code-block:: shell
 
-      {% for supported_gpus, docker in dockers.items() %}
-      .. tab-item:: {{ supported_gpus }}
-         :sync: {{ supported_gpus }}
+      git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git
+      cd Primus
+      git checkout release/v25.10
+      git submodule update --init --recursive
 
-         .. code-block:: shell
+      export DOCKER_IMAGE={{ docker.pull_tag }}
+      export HF_TOKEN=<your_HF_token>
+      export HSA_NO_SCRATCH_RECLAIM=1
+      export NVTE_CK_USES_BWD_V3=1
+      export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
+      export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
+      export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
+      export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
 
-            git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git
-            cd Primus
-            git checkout e16b27b
-
-            export DOCKER_IMAGE={{ docker.pull_tag }}
-            export HF_TOKEN=<your_HF_token>
-            export HSA_NO_SCRATCH_RECLAIM=1
-            export NVTE_CK_USES_BWD_V3=1
-            export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
-            export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
-            export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
-            export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
-      {% endfor %}
+      # Set the variables for better performance
+      # only on MI325X and MI300X
+      export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
+      export NVTE_CK_IS_V3_ATOMIC_FP32=1
 
 .. note::
 
@@ -769,13 +828,13 @@ to launch the multi-node workload. Use the following steps to setup your environ
    * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect. However, since NICs can vary accross different cluster, it is encouraged to explicitly export your NCCL parameters for the cluster.
    * To find your network interface, you can use ``ip a``.
    * To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB  devices.
-   * Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v259`) as appropriate.
+   * Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v2510`) as appropriate.
 
 .. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Llama 3.1 8B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To train Llama 3.1 8B FP8 on 8 nodes, run:
 
@@ -793,7 +852,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Llama 2 7B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To train Llama 2 7B FP8 on 8 nodes, run:
 
@@ -811,7 +870,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Llama 3.1 70B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To train Llama 3.1 70B FP8 on 8 nodes, run:
 
@@ -843,7 +902,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Llama 2 70B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To train Llama 2 70B FP8 on 8 nodes, run:
 
@@ -875,7 +934,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Llama 3.3 70B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To train Llama 3.3 70B FP8 on 8 nodes, run:
 
@@ -907,7 +966,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Llama 2 70B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To train Mixtral 8x7B BF16 on 8 nodes, run:
 
@@ -925,7 +984,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
 
    Once setup is complete, run the appropriate training command.
    The following run commands are tailored to Llama 2 70B.
-   See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
 
    To train Qwen2.5 72B FP8 on 8 nodes, run:
 
@@ -942,7 +1001,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
           --no_fp8_weight_transpose_cache true \
           --fp8 hybrid
 
-.. _amd-primus-megatron-lm-benchmark-test-vars-v259:
+.. _amd-primus-megatron-lm-benchmark-test-vars-v2510:
 
 Key options
 -----------
@@ -987,7 +1046,10 @@ num_layers
 Known issues
 ============
 
-PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
+DeepSeekV3 proxy model and Mixtral 8x22B proxy model may exit with an error
+due to a memory free issue. However, this does not impact training runs. All
+iterations, in this case 50, should have been completed before the exit and
+the results should be available at the end.
 
 Further reading
 ===============
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
index d243800b8..046eb5dc5 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
@@ -29,12 +29,10 @@ with Primus Turbo optimizations.
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
 
-   {% set dockers = data.dockers %}
    .. tab-set::
 
-   {% for supported_gpus, docker in dockers.items() %}
-      .. tab-item:: {{ supported_gpus }}
-         :sync: {{ supported_gpus }}
+      .. tab-item:: {{ data.docker.pull_tag }}
+         :sync: {{ data.docker.pull_tag }}
 
          .. list-table::
             :header-rows: 1
@@ -42,13 +40,12 @@ with Primus Turbo optimizations.
             * - Software component
               - Version
 
-            {% for component_name, component_version in docker.components.items() %}
+            {% for component_name, component_version in data.docker.components.items() %}
             * - {{ component_name }}
               - {{ component_version }}
             {% endfor %}
-   {% endfor %}
 
-.. _amd-primus-pytorch-model-support-v259:
+.. _amd-primus-pytorch-model-support-v2510:
 
 Supported models
 ================
@@ -67,7 +64,7 @@ vary by model -- select one to get started.
             <div class="col-2 me-1 px-2 model-param-head">Model</div>
             <div class="row col-10 pe-0">
       {% for model_group in model_groups %}
-               <div class="col-12 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+               <div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
       {% endfor %}
             </div>
          </div>
@@ -94,7 +91,7 @@ vary by model -- select one to get started.
    For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
    see the documentation :doc:`pytorch-training` (without Primus)
 
-.. _amd-primus-pytorch-performance-measurements-v259:
+.. _amd-primus-pytorch-performance-measurements-v2510:
 
 System validation
 =================
@@ -120,20 +117,11 @@ Pull the Docker image
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
 
-   {% set dockers = data.dockers %}
-
    Use the following command to pull the Docker image from Docker Hub.
 
-   .. tab-set::
+   .. code-block:: shell
 
-      {% for supported_gpus, docker in dockers.items() %}
-      .. tab-item:: {{ supported_gpus }}
-         :sync: {{ supported_gpus }}
-
-         .. code-block:: shell
-
-            docker pull {{ docker.pull_tag }}
-      {% endfor %}
+      docker pull {{ data.docker.pull_tag }}
 
 Run training
 ============
@@ -145,7 +133,7 @@ tweak some configurations (such as batch sizes).
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
 
-   {% set dockers = data.dockers %}
+   {% set docker = data.docker %}
    {% set model_groups = data.model_groups %}
 
    .. tab-set::
@@ -158,7 +146,7 @@ tweak some configurations (such as batch sizes).
          .. container:: model-doc {{ model.mad_tag }}
 
             The following run command is tailored to {{ model.model }}.
-            See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
+            See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model.
 
             1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
                directory and install the required packages on the host machine.
@@ -185,13 +173,6 @@ tweak some configurations (such as batch sizes).
                ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
                model are collected in ``~/MAD/perf.csv``.
 
-               .. note::
-
-                  Currently, Primus torchtitan models are run with Primus Turbo
-                  enabled for enhanced performance. To disable Primus Turbo,
-                  modify respective configuration file
-                  ``scripts/primus/pytorch_train/primus_torchtitan_scripts/llama3_[8B|70B]-[BF16|FP8].yaml``.
-
       {% endfor %}
    {% endfor %}
 
@@ -203,48 +184,34 @@ tweak some configurations (such as batch sizes).
          .. container:: model-doc {{ model.mad_tag }}
 
             The following run commands are tailored to {{ model.model }}.
-            See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
+            See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model.
 
             .. rubric:: Download the Docker image and required packages
 
-            1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
+            1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.
 
-               .. tab-set::
+               .. code-block:: shell
 
-                  {% for supported_gpus, docker in dockers.items() %}
-                  .. tab-item:: {{ supported_gpus }}
-                     :sync: {{ supported_gpus }}
-
-                     .. code-block:: shell
-
-                        docker pull {{ docker.pull_tag }}
-                  {% endfor %}
+                  docker pull {{ docker.pull_tag }}
 
             2. Run the Docker container.
 
-               .. tab-set::
+               .. code-block:: shell
 
-                  {% for supported_gpus, docker in dockers.items() %}
-                  .. tab-item:: {{ supported_gpus }}
-                     :sync: {{ supported_gpus }}
-
-                     .. code-block:: shell
-
-                        docker run -it \
-                            --device /dev/dri \
-                            --device /dev/kfd \
-                            --network host \
-                            --ipc host \
-                            --group-add video \
-                            --cap-add SYS_PTRACE \
-                            --security-opt seccomp=unconfined \
-                            --privileged \
-                            -v $HOME:$HOME \
-                            -v $HOME/.ssh:/root/.ssh \
-                            --shm-size 64G \
-                            --name training_env \
-                            {{ docker.pull_tag }}
-                  {% endfor %}
+                  docker run -it \
+                      --device /dev/dri \
+                      --device /dev/kfd \
+                      --network host \
+                      --ipc host \
+                      --group-add video \
+                      --cap-add SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --privileged \
+                      -v $HOME:$HOME \
+                      -v $HOME/.ssh:/root/.ssh \
+                      --shm-size 64G \
+                      --name training_env \
+                      {{ docker.pull_tag }}
 
                Use these commands if you exit the ``training_env`` container and need to return to it.
 
@@ -283,37 +250,28 @@ tweak some configurations (such as batch sizes).
                .. tab-set::
 
                   .. tab-item:: MI355X and MI350X
-                     :sync: MI355X and MI300X
+                     :sync: MI355X
 
                      .. code-block:: shell
 
-                        EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
-                        bash examples/run_pretrain.sh \
-                            --metrics.enable_tensorboard false \
-                            --profiling.enable_profiling false \
-                            --training.batch_size 5
+                        EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 6
 
                   .. tab-item:: MI325X
                      :sync: MI325X
 
                      .. code-block:: shell
 
-                        EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
-                        bash examples/run_pretrain.sh \
-                            --metrics.enable_tensorboard false \
-                            --profiling.enable_profiling false \
-                            --training.batch_size 6
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 6
 
                   .. tab-item:: MI300X
-                     :sync: MI325X and MI300X
+                     :sync: MI300X
 
                      .. code-block:: shell
 
-                        EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
-                        bash examples/run_pretrain.sh \
-                            --metrics.enable_tensorboard false \
-                            --profiling.enable_profiling false \
-                            --training.batch_size 4
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 4
 
 
                To train Llama 3.1 8B with FP8 precision, use the following command.
@@ -321,37 +279,28 @@ tweak some configurations (such as batch sizes).
                .. tab-set::
 
                   .. tab-item:: MI355X and MI350X
-                     :sync: MI355X and MI300X
+                     :sync: MI355X
 
                      .. code-block:: shell
 
-                        EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
-                        bash examples/run_pretrain.sh \
-                            --metrics.enable_tensorboard false \
-                            --profiling.enable_profiling false \
-                            --training.batch_size 8
+                        EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 8
 
                   .. tab-item:: MI325X
                      :sync: MI325X
 
                      .. code-block:: shell
 
-                        EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
-                        bash examples/run_pretrain.sh \
-                            --metrics.enable_tensorboard false \
-                            --profiling.enable_profiling false \
-                            --training.batch_size 7
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 7
 
                   .. tab-item:: MI300X
-                     :sync: MI325X and MI300X
+                     :sync: MI300X
 
                      .. code-block:: shell
 
-                        EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
-                        bash examples/run_pretrain.sh \
-                            --metrics.enable_tensorboard false \
-                            --profiling.enable_profiling false \
-                            --training.batch_size 5
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 5
 
             .. container:: model-doc primus_pyt_train_llama-3.1-70b
 
@@ -364,36 +313,57 @@ tweak some configurations (such as batch sizes).
 
                      .. code-block:: shell
 
-                        EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
-                        bash examples/run_pretrain.sh \
-                            --metrics.enable_tensorboard false \
-                            --profiling.enable_profiling false \
-                            --training.batch_size 8
+                        EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 8
 
                   .. tab-item:: MI325X
                      :sync: MI325X
 
                      .. code-block:: shell
 
-                        EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
-                        bash examples/run_pretrain.sh \
-                            --metrics.enable_tensorboard false \
-                            --profiling.enable_profiling false \
-                            --training.batch_size 6
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 6
 
                   .. tab-item:: MI300X
-                     :sync: MI325X and MI300X
+                     :sync: MI300X
 
                      .. code-block:: shell
 
-                        EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
-                        bash examples/run_pretrain.sh \
-                            --metrics.enable_tensorboard false \
-                            --profiling.enable_profiling false \
-                            --training.batch_size 4
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 4
 
                To train Llama 3.1 70B with FP8 precision, use the following command.
 
+               .. tab-set::
+
+                  .. tab-item:: MI355X and MI350X
+                     :sync: MI355X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 6
+
+                  .. tab-item:: MI325X
+                     :sync: MI325X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 5
+
+                  .. tab-item:: MI300X
+                     :sync: MI300X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 3
+
+            .. container:: model-doc primus_pyt_train_deepseek-v2
+
+               Use the following command to train DeepSeek V2 16B with BF16 precision using Primus torchtitan.
+
                .. tab-set::
 
                   .. tab-item:: MI355X and MI350X
@@ -401,151 +371,55 @@ tweak some configurations (such as batch sizes).
 
                      .. code-block:: shell
 
-                        EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
-                        bash examples/run_pretrain.sh \
-                            --metrics.enable_tensorboard false \
-                            --profiling.enable_profiling false \
-                            --training.batch_size 6
+                        EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 16
 
                   .. tab-item:: MI325X
                      :sync: MI325X
 
                      .. code-block:: shell
 
-                        EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
-                        bash examples/run_pretrain.sh \
-                            --metrics.enable_tensorboard false \
-                            --profiling.enable_profiling false \
-                            --training.batch_size 5
+                        EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 10
 
                   .. tab-item:: MI300X
-                     :sync: MI325X and MI300X
+                     :sync: MI300X
 
                      .. code-block:: shell
 
-                        EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
-                        bash examples/run_pretrain.sh \
-                            --metrics.enable_tensorboard false \
-                            --profiling.enable_profiling false \
-                            --training.batch_size 3
-      {% endfor %}
-   {% endfor %}
+                        EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 8
 
-      .. tab-item:: Standalone torchtitan benchmarking
-
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-         .. container:: model-doc {{ model.mad_tag }}
-
-            The following run commands are tailored to {{ model.model }}.
-            See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
-
-            .. rubric:: Download the Docker image and required packages
-
-            1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
+               To train DeepSeek V2 16B with FP8 precision, use the following command.
 
                .. tab-set::
 
-                  {% for supported_gpus, docker in dockers.items() %}
-                  .. tab-item:: {{ supported_gpus }}
-                     :sync: {{ supported_gpus }}
+                  .. tab-item:: MI355X and MI350X
+                     :sync: MI355X
 
                      .. code-block:: shell
 
-                        docker pull {{ docker.pull_tag }}
-                  {% endfor %}
+                        EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 16
 
-            2. Run the Docker container.
-
-               .. tab-set::
-
-                  {% for supported_gpus, docker in dockers.items() %}
-                  .. tab-item:: {{ supported_gpus }}
-                     :sync: {{ supported_gpus }}
+                  .. tab-item:: MI325X
+                     :sync: MI325X
 
                      .. code-block:: shell
 
-                        docker run -it \
-                            --device /dev/dri \
-                            --device /dev/kfd \
-                            --network host \
-                            --ipc host \
-                            --group-add video \
-                            --cap-add SYS_PTRACE \
-                            --security-opt seccomp=unconfined \
-                            --privileged \
-                            -v $HOME:$HOME \
-                            -v $HOME/.ssh:/root/.ssh \
-                            --shm-size 64G \
-                            --name training_env \
-                            {{ docker.pull_tag }}
-                  {% endfor %}
+                        EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 8
 
-               Use these commands if you exit the ``training_env`` container and need to return to it.
+                  .. tab-item:: MI300X
+                     :sync: MI300X
 
-               .. code-block:: shell
+                     .. code-block:: shell
 
-                  docker start training_env
-                  docker exec -it training_env bash
-
-            3. Navigate to the ``torchtitan`` workspace directory.
-
-               .. code-block:: shell
-
-                  cd /workspace/torchtitan
-
-            .. rubric:: Download the tokenizer
-
-            1. The following benchmarking examples require downloading models and datasets
-               from Hugging Face. To ensure successful access to gated repos, set your
-               ``HF_TOKEN``.
-
-               .. code-block:: shell
-
-                  export HF_TOKEN=$your_personal_hugging_face_access_token
-
-            2. Download the tokenizer for your model.
-
-               .. container:: model-doc {{ model.mad_tag }}
-
-                  .. code-block:: shell
-
-                     python3 scripts/download_tokenizer.py \
-                        --repo_id {{ model.model_repo }} \
-                        --tokenizer_path "original" \
-                        --hf_token=${HF_TOKEN}
-
-            .. rubric:: Pretraining examples
-
-            Run the training script with the appropriate configuration file.
-
-            For train with BF16 precicion, use the following command:
-
-            .. container:: model-doc {{ model.mad_tag }}
-
-               .. code-block:: shell
-
-                  CONFIG_FILE={{ model.config_file.bf16 }} \
-                  .run_train.sh
-
-            For train with BF16 precicion, use the following command:
-
-            .. container:: model-doc {{ model.mad_tag }}
-
-               .. code-block:: shell
-
-                  CONFIG_FILE={{ model.config_file.fp8 }} \
-                  .run_train.sh
+                        EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.batch_size 8
       {% endfor %}
    {% endfor %}
 
-Known issues
-============
-
-PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
-
-
 Further reading
 ===============
 
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
index 782cc61b3..f1e8c7f09 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
@@ -27,12 +27,10 @@ training workloads:
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
 
-   {% set dockers = data.dockers %}
    .. tab-set::
 
-   {% for supported_gpus, docker in dockers.items() %}
-      .. tab-item:: {{ supported_gpus }}
-         :sync: {{ supported_gpus }}
+      .. tab-item:: {{ data.docker.pull_tag }}
+         :sync: {{ data.docker.pull_tag }}
 
          .. list-table::
             :header-rows: 1
@@ -40,13 +38,12 @@ training workloads:
             * - Software component
               - Version
 
-            {% for component_name, component_version in docker.components.items() %}
+            {% for component_name, component_version in data.docker.components.items() %}
             * - {{ component_name }}
               - {{ component_version }}
             {% endfor %}
-   {% endfor %}
 
-.. _amd-pytorch-training-model-support-v259:
+.. _amd-pytorch-training-model-support-v2510:
 
 Supported models
 ================
@@ -88,7 +85,7 @@ one to get started.
          </div>
       </div>
 
-.. _amd-pytorch-training-supported-training-modes-v259:
+.. _amd-pytorch-training-supported-training-modes-v2510:
 
 The following table lists supported training modes per model.
 
@@ -123,7 +120,7 @@ The following table lists supported training modes per model.
          unlisted fine-tuning methods by using an existing file in the
          ``/workspace/torchtune/recipes/configs`` directory as a template.
 
-.. _amd-pytorch-training-performance-measurements-v259:
+.. _amd-pytorch-training-performance-measurements-v2510:
 
 Performance measurements
 ========================
@@ -164,7 +161,7 @@ Run training
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
 
-   {% set dockers = data.dockers %}
+   {% set docker = data.docker %}
    {% set model_groups = data.model_groups %}
 
    Once the setup is complete, choose between two options to start benchmarking training:
@@ -179,7 +176,7 @@ Run training
          .. container:: model-doc {{ model.mad_tag }}
 
             The following run command is tailored to {{ model.model }}.
-            See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
+            See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model.
 
             1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
                directory and install the required packages on the host machine.
@@ -217,7 +214,7 @@ Run training
          .. container:: model-doc {{ model.mad_tag }}
 
             The following commands are tailored to {{ model.model }}.
-            See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
+            See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model.
 
       {% endfor %}
    {% endfor %}
@@ -226,42 +223,28 @@ Run training
 
          1. Use the following command to pull the Docker image from Docker Hub.
 
-            .. tab-set::
+            .. code-block:: shell
 
-               {% for supported_gpus, docker in dockers.items() %}
-               .. tab-item:: {{ supported_gpus }}
-                  :sync: {{ supported_gpus }}
-
-                  .. code-block:: shell
-
-                     docker pull {{ docker.pull_tag }}
-               {% endfor %}
+               docker pull {{ docker.pull_tag }}
 
          2. Launch the Docker container.
 
-            .. tab-set::
+            .. code-block:: shell
 
-               {% for supported_gpus, docker in dockers.items() %}
-               .. tab-item:: {{ supported_gpus }}
-                  :sync: {{ supported_gpus }}
-
-                  .. code-block:: shell
-
-                     docker run -it \
-                         --device /dev/dri \
-                         --device /dev/kfd \
-                         --network host \
-                         --ipc host \
-                         --group-add video \
-                         --cap-add SYS_PTRACE \
-                         --security-opt seccomp=unconfined \
-                         --privileged \
-                         -v $HOME:$HOME \
-                         -v $HOME/.ssh:/root/.ssh \
-                         --shm-size 64G \
-                         --name training_env \
-                         {{ docker.pull_tag }}
-               {% endfor %}
+               docker run -it \
+                   --device /dev/dri \
+                   --device /dev/kfd \
+                   --network host \
+                   --ipc host \
+                   --group-add video \
+                   --cap-add SYS_PTRACE \
+                   --security-opt seccomp=unconfined \
+                   --privileged \
+                   -v $HOME:$HOME \
+                   -v $HOME/.ssh:/root/.ssh \
+                   --shm-size 64G \
+                   --name training_env \
+                   {{ docker.pull_tag }}
 
             Use these commands if you exit the ``training_env`` container and need to return to it.
 
@@ -419,11 +402,34 @@ Run training
 
          .. container:: model-doc {{ model.mad_tag }}
 
-            .. rubric:: Pre-training
+            .. rubric:: Pretraining
 
             To start the pre-training benchmark, use the following command with the
             appropriate options. See the following list of options and their descriptions.
 
+            {% if model.mad_tag == "pyt_train_dlrm" %}
+
+            1. Go to the DLRM directory.
+
+               .. code-block:: shell
+
+                  cd /workspace/DLRMBenchmark
+
+            2. To run the single-node training benchmark for DLRM-v2 with TF32 precision,
+               run the following script.
+
+               .. code-block:: shell
+
+                  ./launch_training_single_node.sh
+
+               To run with MAD within the Docker container, use the following command.
+
+               .. code-block:: shell
+
+                  ./pytorch_benchmark_report.sh -t pretrain -m DLRM
+
+            {% else %}
+
             .. code-block:: shell
 
                ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
@@ -466,6 +472,7 @@ Run training
                * - ``$sequence_length``
                  - Sequence length for the language model.
                  - Between 2048 and 8192. 8192 by default.
+            {% endif %}
          {% endif %}
 
          {% set training_modes = model.training_modes %}
@@ -525,7 +532,7 @@ Run training
 
             To start the fine-tuning benchmark, use the following command with the
             appropriate options. See the following list of options and their descriptions.
-            See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v259>`.
+            See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v2510>`.
 
             .. code-block:: shell
 
@@ -590,7 +597,7 @@ Run training
 
             For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
 
-.. _amd-pytorch-training-multinode-examples-v259:
+.. _amd-pytorch-training-multinode-examples-v2510:
 
 Multi-node training
 -------------------
@@ -639,11 +646,6 @@ To launch the training job on a SLURM cluster for Llama 3.3 70B, run the followi
 
 Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
 
-Known issues
-============
-
-PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
-
 Further reading
 ===============