From 98029db4eeb729d7ab709d79c03f867966b275a1 Mon Sep 17 00:00:00 2001 From: Peter Park Date: Thu, 21 Aug 2025 23:50:55 -0400 Subject: [PATCH] docs: Add Primus (Megatron) training Docker documentation (#5218) --- .wordlist.txt | 6 + .../megatron-lm-benchmark-models.yaml | 27 +- .../megatron-lm-v25.6-benchmark-models.yaml | 60 + .../primus-megatron-benchmark-models.yaml | 58 + .../training/benchmark-docker/megatron-lm.rst | 53 +- .../previous-versions/megatron-lm-history.rst | 12 +- .../megatron-lm-primus-migration-guide.rst | 175 +++ .../previous-versions/megatron-lm-v25.6.rst | 1041 +++++++++++++++++ .../benchmark-docker/primus-megatron.rst | 602 ++++++++++ docs/how-to/rocm-for-ai/training/index.rst | 2 + docs/sphinx/_toc.yml.in | 4 +- 11 files changed, 1994 insertions(+), 46 deletions(-) create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml create mode 100644 docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6.rst create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst diff --git a/.wordlist.txt b/.wordlist.txt index 7b592fc91..9d0f4d6cc 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -116,6 +116,7 @@ Deprecations DevCap DirectX Dockerfile +Dockerized Doxygen dropless ELMo @@ -361,6 +362,7 @@ PowerEdge PowerShell Pretrained Pretraining +Primus Profiler's PyPi Pytest @@ -525,6 +527,7 @@ Xilinx Xnack Xteam YAML +YAMLs YML YModel ZeRO @@ -585,6 +588,7 @@ completers composable concretization config +configs conformant constructible convolutional @@ -795,7 +799,9 @@ preprocessing preprocessor prequantized prerequisites +pretrain pretraining +primus profiler profilers protobuf diff --git a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml index 77eaa5ba0..c743e00b6 100644 --- a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml +++ b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml @@ -1,26 +1,15 @@ dockers: - - pull_tag: rocm/megatron-lm:v25.6_py312 - docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0 + - pull_tag: rocm/megatron-lm:v25.7_py310 + docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a components: - ROCm: 6.4.1 - PyTorch: 2.8.0a0+git7d205b2 - Python: 3.12 - Transformer Engine: 2.1.0.dev0+8c4a512 - hipBLASLt: 393e413 - Triton: 3.3.0 - RCCL: 2.23.4.7a84c5d - doc_name: Ubuntu 24.04 + Python 3.12 - - pull_tag: rocm/megatron-lm:v25.6_py310 - docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6 - components: - ROCm: 6.4.1 - PyTorch: 2.8.0a0+git7d205b2 + ROCm: 6.4.2 + Primus: v0.1.0-rc1 + PyTorch: 2.8.0a0+gitd06a406 Python: "3.10" - Transformer Engine: 2.1.0.dev0+8c4a512 - hipBLASLt: 393e413 + Transformer Engine: 2.1.0.dev0+ba586519 + hipBLASLt: 37ba1d36 Triton: 3.3.0 - RCCL: 2.23.4.7a84c5d - doc_name: Ubuntu 22.04 + Python 3.10 + RCCL: 2.22.3 model_groups: - group: Meta Llama tag: llama diff --git 
a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml new file mode 100644 index 000000000..77eaa5ba0 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml @@ -0,0 +1,60 @@ +dockers: + - pull_tag: rocm/megatron-lm:v25.6_py312 + docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0 + components: + ROCm: 6.4.1 + PyTorch: 2.8.0a0+git7d205b2 + Python: 3.12 + Transformer Engine: 2.1.0.dev0+8c4a512 + hipBLASLt: 393e413 + Triton: 3.3.0 + RCCL: 2.23.4.7a84c5d + doc_name: Ubuntu 24.04 + Python 3.12 + - pull_tag: rocm/megatron-lm:v25.6_py310 + docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6 + components: + ROCm: 6.4.1 + PyTorch: 2.8.0a0+git7d205b2 + Python: "3.10" + Transformer Engine: 2.1.0.dev0+8c4a512 + hipBLASLt: 393e413 + Triton: 3.3.0 + RCCL: 2.23.4.7a84c5d + doc_name: Ubuntu 22.04 + Python 3.10 +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.3 70B + mad_tag: pyt_megatron_lm_train_llama-3.3-70b + - model: Llama 3.1 8B + mad_tag: pyt_megatron_lm_train_llama-3.1-8b + - model: Llama 3.1 70B + mad_tag: pyt_megatron_lm_train_llama-3.1-70b + - model: Llama 3.1 70B (proxy) + mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy + - model: Llama 2 7B + mad_tag: pyt_megatron_lm_train_llama-2-7b + - model: Llama 2 70B + mad_tag: pyt_megatron_lm_train_llama-2-70b + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek-V3 (proxy) + mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy + - model: DeepSeek-V2-Lite + mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b + - group: Mistral AI + tag: mistral + models: + - model: Mixtral 8x7B + mad_tag: pyt_megatron_lm_train_mixtral-8x7b + - model: Mixtral 8x22B (proxy) + mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy + - group: Qwen + tag: qwen + models: + - model: Qwen 2.5 7B + mad_tag: pyt_megatron_lm_train_qwen2.5-7b + - model: Qwen 2.5 72B + mad_tag: pyt_megatron_lm_train_qwen2.5-72b diff --git a/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml new file mode 100644 index 000000000..fec474f59 --- /dev/null +++ b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml @@ -0,0 +1,58 @@ +dockers: + - pull_tag: rocm/megatron-lm:v25.7_py310 + docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a + components: + ROCm: 6.4.2 + Primus: v0.1.0-rc1 + PyTorch: 2.8.0a0+gitd06a406 + Python: "3.10" + Transformer Engine: 2.1.0.dev0+ba586519 + hipBLASLt: 37ba1d36 + Triton: 3.3.0 + RCCL: 2.22.3 +model_groups: + - group: Meta Llama + tag: llama + models: + - model: Llama 3.3 70B + mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b + config_name: llama3.3_70B-pretrain.yaml + - model: Llama 3.1 70B + mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b + config_name: llama3.1_70B-pretrain.yaml + - model: Llama 3.1 8B + mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b + config_name: llama3.1_8B-pretrain.yaml + - model: Llama 2 7B + mad_tag: primus_pyt_megatron_lm_train_llama-2-7b + config_name: llama2_7B-pretrain.yaml 
+ - model: Llama 2 70B + mad_tag: primus_pyt_megatron_lm_train_llama-2-70b + config_name: llama2_70B-pretrain.yaml + - group: DeepSeek + tag: deepseek + models: + - model: DeepSeek-V3 (proxy) + mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy + config_name: deepseek_v3-pretrain.yaml + - model: DeepSeek-V2-Lite + mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b + config_name: deepseek_v2_lite-pretrain.yaml + - group: Mistral AI + tag: mistral + models: + - model: Mixtral 8x7B + mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b + config_name: mixtral_8x7B_v0.1-pretrain.yaml + - model: Mixtral 8x22B (proxy) + mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy + config_name: mixtral_8x22B_v0.1-pretrain.yaml + - group: Qwen + tag: qwen + models: + - model: Qwen 2.5 7B + mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b + config_name: primus_qwen2.5_7B-pretrain.yaml + - model: Qwen 2.5 72B + mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b + config_name: qwen2.5_72B-pretrain.yaml diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst index f9759c762..687cc514f 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst @@ -1,3 +1,5 @@ +:orphan: + .. meta:: :description: How to train a model using Megatron-LM for ROCm. :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch @@ -6,6 +8,14 @@ Training a model with Megatron-LM for ROCm ****************************************** +.. caution:: + + The ROCm Megatron-LM framework now has limited support with this Docker + environment; it now focuses on Primus with Megatron-Core. See :doc:`primus-megatron`. + + To learn how to migrate your existing workloads to Primus with Megatron-Core, + see :doc:`previous-versions/megatron-lm-primus-migration-guide`. + The `Megatron-LM framework for ROCm `_ is a specialized fork of the robust Megatron-LM, designed to enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD @@ -20,13 +30,17 @@ essential components, including PyTorch, ROCm libraries, and Megatron-LM utilities. It contains the following software components to accelerate training workloads: +.. note:: + + This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with + Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release `. + .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml {% set dockers = data.dockers %} - {% if dockers|length > 1 %} .. tab-set:: - {% for docker in data.dockers %} + {% for docker in dockers %} .. tab-item:: ``{{ docker.pull_tag }}`` :sync: {{ docker.pull_tag }} @@ -42,28 +56,14 @@ workloads: {% endfor %} {% endfor %} - {% elif dockers|length == 1 %} - .. list-table:: - :header-rows: 1 - - * - Software component - - Version - - {% for component_name, component_version in docker.components %} - * - {{ component_name }} - - {{ component_version }} - - {% endfor %} - {% endif %} .. _amd-megatron-lm-model-support: - The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators. - Supported models ================ - The following models are supported for training performance benchmarking with Megatron-LM and ROCm. 
+ The following models are supported for training performance benchmarking with Megatron-LM and ROCm + on AMD Instinct MI300X series accelerators. Some instructions, commands, and training recommendations in this documentation might vary by model -- select one to get started. @@ -177,7 +177,7 @@ Download the Docker image {% if dockers|length > 1 %} .. tab-set:: - {% for docker in data.dockers %} + {% for docker in dockers %} .. tab-item:: {{ docker.doc_name }} :sync: {{ docker.pull_tag }} @@ -227,10 +227,17 @@ Download the Docker image docker start megatron_training_env docker exec -it megatron_training_env bash -The Docker container includes a pre-installed, verified version of the ROCm -Megatron-LM development branch -``__, including necessary -training scripts. +4. **Megatron-LM backward compatibility setup** -- this Docker is primarily intended for use with Primus, but it maintains Megatron-LM compatibility with limited support. + To roll back to using Megatron-LM, follow these steps: + + .. code-block:: shell + + cd /workspace/Megatron-LM/ + pip uninstall megatron-core + pip install -e . + +The Docker container hosts +``__ at verified commit ``e8e9edc``. .. _amd-megatron-lm-environment-setup: diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst index 9dd1c8f2c..f4ed199ef 100644 --- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst @@ -16,12 +16,20 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub ` + * `Docker Hub (py310) `__ + + * - v25.6 - * ROCm 6.4.1 * PyTorch 2.8.0a0+git7d205b2 - - * :doc:`Documentation <../megatron-lm>` + * :doc:`Documentation ` * `Docker Hub (py312) `__ * `Docker Hub (py310) `__ diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst new file mode 100644 index 000000000..9275c1f39 --- /dev/null +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst @@ -0,0 +1,175 @@ +:orphan: + +********************************************************************** +Migrating workloads to Primus (Megatron-Core backend) from Megatron-LM +********************************************************************** + +Primus supports Megatron-Core as backend optimization library, +replacing ROCm Megatron-LM. This document outlines the steps to migrate +workload from ROCm Megatron-LM to Primus with the Megatron-Core backend. + +Model architecture +================== + +ROCm Megatron-LM defines model architecture parameters in the training scripts; +for example, the Llama 3 8B model parameters are defined in +`examples/llama/train_llama3.sh `__ +as shown below: + +.. code-block:: bash + + HIDDEN_SIZE=4096 + FFN_HIDDEN_SIZE=14336 + NUM_LAYERS=32 + NUM_HEADS=32 + NUM_KV_HEADS=8 + +Primus defines the model architecture through model YAML configuration files +inside the ``primus/configs/models/megatron/`` repository. For example, Llama 3 8B +model architecture parameters are defined in +`primus/configs/models/megatron/llama3_8B.yaml `__ +as shown below: + +.. 
code-block:: yaml + + bases: + - llama3_base.yaml + + tokenizer_type: Llama3Tokenizer + tokenizer_model: meta-llama/Llama-3.1-8B + + ffn_hidden_size: 14336 + hidden_size: 4096 + num_attention_heads: 32 + num_layers: 32 + num_query_groups: 8 + +Primus' model config files follow a hierarchical design, meaning that new model +config YAMLs can inherit existing model config files by importing them as +bases. For example, +`llama3.1_8B.yaml `__ +uses ``llama3_8B.yaml`` as a base config and overrides few parameters, as shown below. +In this example, ``llama3.1_8B`` overrides the ``max_position_embeddings`` value: + +.. code-block:: yaml + + bases: + - llama3_8B.yaml + + tokenizer_type: Llama3Tokenizer + tokenizer_model: meta-llama/Llama-3.1-8B + + max_position_embeddings: 131072 + +.. tip:: + + Primus provides ``llama_base.yaml`` as the base configuration, which can be + used as bases for additional model architectures. For example, + `mixtral_base.yaml `__ + and + `deepseek_v3_base.yaml `__ + define ``llama_base.yaml`` as its base. + + .. code-block:: yaml + + # Example mixtral_base.yaml: + + bases: + - llama_base.yaml + + init_method_std: 0.01 + rotary_base: 1000000 + qk_layernorm: false + + group_query_attention: true + num_query_groups: 8 + + # moe parameters + num_experts: 8 + moe_router_topk: 2 + moe_router_load_balancing_type: aux_loss + moe_aux_loss_coeff: 1e-2 + moe_grouped_gemm: true + moe_token_dispatcher_type: alltoall + +It is recommended to add a new ``${MODEL_NAME}_base.yaml`` to add a new +category of model and define new models on top of it. For example, to add +Qwen2.5 models in Primus, we define +`qwen2.5_base.yaml `__ +and build +`qwen2.5_7B.yaml `__ +and +`qwen2.5_72B.yaml `__ +using ``qwen2.5_base.yaml`` as the base config. + +Training parameters +=================== + +ROCm Megatron-LM also defines the training parameters, like batch size, +tensor-parallelism, precision, as so on, in the training scripts. For example, +Llama3 8B model parameters are defined in +`examples/llama/train_llama3.sh `__ +as shown below: + +.. code-block:: bash + + TP="${TP:-8}" + PP="${PP:-1}" + CP="${CP:-1}" + MBS="${MBS:-1}" + BS="${BS:-8}" + +Primus defines the training parameters in top-level YAML files -- see +`examples/megatron/configs/ +`__. +For example, the `llama3.1_8B-pretrain.yaml +`__ +configuration imports the ``llama3.1_8B.yaml`` model architecture file. Users can then override +the default training parameters in ``llama3.1_8B-pretrain.yaml``. + +.. code-block:: yaml + + # model to run + model: llama3.1_8B.yaml # Model architecture yaml + overrides: + # log + # disable_wandb: false + # disable_tensorboard: false + stderr_sink_level: DEBUG + + log_avg_skip_iterations: 2 + log_avg_reset_interval: 50 + + train_iters: 50 + micro_batch_size: 2 + global_batch_size: 128 + + seq_length: 8192 + max_position_embeddings: 8192 + + lr: 1.0e-5 + min_lr: 0.0 + lr_warmup_iters: 2 + lr_decay_iters: null + lr_decay_style: cosine + weight_decay: 0.1 + adam_beta1: 0.9 + adam_beta2: 0.95 + eod_mask_loss: true + init_method_std: 0.008 + norm_epsilon: 1.0e-6 + +Backward compatibility with Megatron-LM +======================================= + +The Dockerized environment used for Primus maintains compatibility with Megatron-LM with +limited support. To roll back to using Megatron-LM, follow these steps. + +.. code-block:: shell + + cd /workspace/Megatron-LM/ + pip uninstall megatron-core + pip install -e . 
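+
+To confirm the rollback took effect, you can check which ``megatron.core``
+installation Python resolves to -- a quick sanity check assuming the
+``/workspace/Megatron-LM`` layout described above:
+
+.. code-block:: shell
+
+   # After the editable install, this should print a path under /workspace/Megatron-LM
+   python -c "import megatron.core, os; print(os.path.dirname(megatron.core.__file__))"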
+ +Once Megatron-LM is installed, follow :doc:`the documentation <../megatron-lm>` to run workloads as +usual. diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6.rst new file mode 100644 index 000000000..32d72311b --- /dev/null +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6.rst @@ -0,0 +1,1041 @@ +:orphan: + +.. meta:: + :description: How to train a model using Megatron-LM for ROCm. + :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch + +****************************************** +Training a model with Megatron-LM for ROCm +****************************************** + +.. caution:: + + This documentation does not reflect the latest version of ROCm Megatron-LM + training performance documentation. See :doc:`../megatron-lm` for the latest version. + +The `Megatron-LM framework for ROCm `__ is +a specialized fork of the robust Megatron-LM, designed to enable efficient +training of large-scale language models on AMD GPUs. By leveraging AMD +Instinctâ„¢ MI300X series accelerators, Megatron-LM delivers enhanced +scalability, performance, and resource utilization for AI workloads. It is +purpose-built to support models like Llama, DeepSeek, and Mixtral, +enabling developers to train next-generation AI models more +efficiently. + +AMD provides ready-to-use Docker images for MI300X series accelerators containing +essential components, including PyTorch, ROCm libraries, and Megatron-LM +utilities. It contains the following software components to accelerate training +workloads: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml + + {% set dockers = data.dockers %} + {% if dockers|length > 1 %} + .. tab-set:: + + {% for docker in data.dockers %} + .. tab-item:: ``{{ docker.pull_tag }}`` + :sync: {{ docker.pull_tag }} + + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + + {% endfor %} + {% endfor %} + {% elif dockers|length == 1 %} + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components %} + * - {{ component_name }} + - {{ component_version }} + + {% endfor %} + {% endif %} + + .. _amd-megatron-lm-model-support-v256: + + The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators. + + Supported models + ================ + + The following models are supported for training performance benchmarking with Megatron-LM and ROCm. + Some instructions, commands, and training recommendations in this documentation might + vary by model -- select one to get started. + + {% set model_groups = data.model_groups %} + .. raw:: html + +
+         <!-- Model selector: a "Model" button for each model group and a
+              "Model variant" button for each model defined in the YAML data above. -->
+ +.. note:: + + Some models, such as Llama, require an external license agreement through + a third party (for example, Meta). + +.. _amd-megatron-lm-performance-measurements-v256: + +Performance measurements +======================== + +To evaluate performance, the +`Performance results with AMD ROCm software `__ +page provides reference throughput and latency measurements for training +popular AI models. + +.. important:: + + The performance data presented in + `Performance results with AMD ROCm software `__ + only reflects the latest version of this training benchmarking environment. + The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software. + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +.. _mi300x-amd-megatron-lm-training-v256: + +Environment setup +================= + +Use the following instructions to set up the environment, configure the script to train models, and +reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker +image. + +.. _amd-megatron-lm-requirements-v256: + +Download the Docker image +------------------------- + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml + + {% set dockers = data.dockers %} + 1. Use the following command to pull the Docker image from Docker Hub. + + {% if dockers|length > 1 %} + .. tab-set:: + + {% for docker in data.dockers %} + .. tab-item:: {{ docker.doc_name }} + :sync: {{ docker.pull_tag }} + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + + {% endfor %} + {% elif dockers|length == 1 %} + {% set docker = dockers[0] %} + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + + {% endif %} + 2. Launch the Docker container. + + {% if dockers|length > 1 %} + .. tab-set:: + + {% for docker in data.dockers %} + .. tab-item:: {{ docker.doc_name }} + :sync: {{ docker.pull_tag }} + + .. code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --network host --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + --name megatron_training_env \ + {{ docker.pull_tag }} + + {% endfor %} + {% elif dockers|length == 1 %} + {% set docker = dockers[0] %} + .. code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --network host --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + -v $HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + --name megatron_training_env \ + {{ docker.pull_tag }} + + {% endif %} + +3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it. + + .. 
code-block:: shell + + docker start megatron_training_env + docker exec -it megatron_training_env bash + +The Docker container includes a pre-installed, verified version of the ROCm +Megatron-LM development branch +``__, including necessary +training scripts. + +.. _amd-megatron-lm-environment-setup-v256: + +Configuration +============= + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b + + Update the ``train_llama3.sh`` configuration script in the ``examples/llama`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b + + Update the ``train_llama2.sh`` configuration script in the ``examples/llama`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral`` + directory of + ``__ to configure your training run. + Options can also be passed as command line arguments as described in :ref:`Run training `. + +.. note:: + + See :ref:`Key options ` for more information on configuration options. + +Network interface +----------------- + +Update the network interface in the script to match your system's network interface. To +find your network interface, run the following (outside of any Docker container): + +.. code-block:: bash + + ip a + +Look for an active interface that has an IP address in the same subnet as +your other nodes. Then, update the following variables in the script, for +example: + +.. code-block:: bash + + export NCCL_SOCKET_IFNAME=ens50f0np0 + + export GLOO_SOCKET_IFNAME=ens50f0np0 + +.. _amd-megatron-lm-tokenizer-v256: + +Tokenizer +--------- + +You can assign the path of an existing tokenizer to the ``TOKENIZER_MODEL`` as shown in the following examples. +If the tokenizer is not found, it'll be downloaded if publicly available. + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b + + If you do not have Llama 3.3 tokenizer locally, you need to use your + personal Hugging Face access token ``HF_TOKEN`` to download the tokenizer. + See `Llama-3.3-70B-Instruct + `_. After you are + authorized, use your ``HF_TOKEN`` to download the tokenizer and set the + variable ``TOKENIZER_MODEL`` to the tokenizer path. + + .. code-block:: shell + + export HF_TOKEN= + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.3-70B-Instruct" + +.. 
container:: model-doc pyt_megatron_lm_train_llama-3.1-8b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.1-8B" + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="meta-llama/Llama-3.1-70B" + +.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b + + The training script uses either the ``Llama2Tokenizer`` or ``HuggingFaceTokenizer`` by default. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="deepseek-ai/DeepSeek-V3" + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="deepseek-ai/DeepSeek-V2-Lite" + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + Download the Mixtral tokenizer. + + .. code-block:: shell + + mkdir tokenizer + cd tokenizer + export HF_TOKEN= + wget --header="Authorization: Bearer $HF_TOKEN" -O ./tokenizer.model https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/resolve/main/tokenizer.model + + Use the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL=tokenizer/tokenizer.model + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="Qwen/Qwen2.5-7B" + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b + + The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path. + + .. code-block:: shell + + TOKENIZER_MODEL="Qwen/Qwen2.5-72B" + +Dataset options +--------------- + +You can use either mock data or real data for training. + +* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default + value is ``1`` for enabled. + + .. code-block:: bash + + MOCK_DATA=1 + +* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 + + DATA_PATH="/data/bookcorpus_text_sentence" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +Download the dataset +^^^^^^^^^^^^^^^^^^^^ + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b pyt_megatron_lm_train_llama-3.1-70b-proxy + + For Llama models, use the `prepare_dataset.sh + `_ script + to prepare your dataset. + To download the dataset, set the ``DATASET`` variable to the dataset you'd + like to use. Three datasets are supported: ``DATASET=wiki``, ``DATASET=fineweb``, and + ``DATASET=bookcorpus``. + + .. 
code-block:: shell + + DATASET=wiki TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for wiki-en dataset + DATASET=bookcorpus TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for bookcorpus dataset + + ``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer. + Remember to either pre-download the tokenizer or setup Hugging Face access + otherwise when needed -- see the :ref:`Tokenizer ` section. + + .. note:: + + When training set ``DATA_PATH`` to the specific file name prefix pointing to the ``.bin`` or ``.idx`` + as in the following example: + + .. code-block:: shell + + DATA_PATH="data/bookcorpus_text_sentence" # Change to where your dataset is stored. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + If you don't already have the dataset, download the DeepSeek dataset using the following + commands: + + .. code-block:: shell + + mkdir deepseek-datasets + cd deepseek-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json + cd .. + bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3 + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/deepseek-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + If you don't already have the dataset, download the DeepSeek dataset using the following + commands: + + .. code-block:: shell + + mkdir deepseek-datasets + cd deepseek-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json + cd .. + bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3 + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/deepseek-datasets" # Change to where your dataset is stored + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy + + If you don't already have the dataset, download the Mixtral dataset using the following + commands: + + .. 
code-block:: shell + + mkdir mixtral-datasets + cd mixtral-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.bin + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.idx + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/mixtral-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b pyt_megatron_lm_train_qwen2.5-72b + + If you don't already have the dataset, download the Mixtral dataset using the following + commands: + + .. code-block:: shell + + mkdir -p temp/qwen-datasets + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.bin + wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.idx + + To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset. + + .. code-block:: bash + + MOCK_DATA=0 # Train on real data + + DATA_DIR="/qwen-datasets" # Change to where your dataset is stored + + Ensure that the files are accessible inside the Docker container. + +Multi-node configuration +------------------------ + +If you're running multi-node training, update the following environment variables. They can +also be passed as command line arguments. Refer to the following example configurations. + +* Change ``localhost`` to the master node's hostname: + + .. code-block:: shell + + MASTER_ADDR="${MASTER_ADDR:-localhost}" + +* Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``): + + .. code-block:: shell + + NNODES="${NNODES:-1}" + +* Set the rank of each node (0 for master, 1 for the first worker node, and so on): + + .. code-block:: shell + + NODE_RANK="${NODE_RANK:-0}" + +* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an + NFS directory) for multi-node runs: + + .. code-block:: shell + + DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs + +* For multi-node runs, make sure the correct network drivers are installed on the nodes. If + inside a Docker container, either install the drivers inside the Docker container or pass the network + drivers from the host while creating the Docker container. + + .. code-block:: shell + + # Specify which RDMA interfaces to use for communication + export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + +.. _amd-megatron-lm-run-training-v256: + +Run training +============ + +Use the following example commands to set up the environment, configure +:ref:`key options `, and run training on +MI300X series accelerators with the AMD Megatron-LM environment. + +Single node training +-------------------- + +.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b + + To run the training on a single node for Llama 3.3 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. 
code-block:: shell + + TOKENIZER_MODEL=meta-llama/Llama-3.3-70B-Instruct \ + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=8192 \ + MBS=2 \ + BS=16 \ + TE_FP8=0 \ + TP=1 \ + PP=1 \ + FSDP=1 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b + + To run training on a single node for Llama 3.1 8B FP8, navigate to the Megatron-LM folder and use the + following command. + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=128 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + + For Llama 3.1 8B BF16, use the following command: + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=128 \ + TP=1 \ + TE_FP8=0 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b + + To run the training on a single node for Llama 3.1 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. code-block:: shell + + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + MBS=3 \ + BS=24 \ + TP=1 \ + TE_FP8=0 \ + FSDP=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama3.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + +.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b-proxy + + To run the training on a single node for Llama 3.1 70B with proxy, use the following command. + + .. code-block:: shell + + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + RECOMPUTE=1 \ + MBS=3 \ + BS=24 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=70 \ + FSDP=1 \ + TOTAL_ITERS=10 \ + NUM_LAYERS=40 \ + bash examples/llama/train_llama3.sh + + .. note:: + + Use two or more nodes to run the *full* Llama 70B model with FP8 precision. + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + +.. container:: model-doc pyt_megatron_lm_train_llama-2-7b + + To run training on a single node for Llama 2 7B FP8, navigate to the Megatron-LM folder and use the + following command. + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=4 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=7 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh + + For Llama 2 7B BF16, use the following command: + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=4 \ + BS=256 \ + TP=1 \ + TE_FP8=0 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=7 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh + +.. container:: model-doc pyt_megatron_lm_train_llama-2-70b + + To run the training on a single node for Llama 2 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument. + For example, use the following command: + + .. 
code-block:: shell + + CKPT_FORMAT=torch_dist \ + TEE_OUTPUT=1 \ + MBS=7 \ + BS=56 \ + TP=1 \ + TE_FP8=0 \ + FSDP=1 \ + RECOMPUTE=1 \ + SEQ_LENGTH=4096 \ + MODEL_SIZE=70 \ + TOTAL_ITERS=50 \ + bash examples/llama/train_llama2.sh + + .. note:: + + It is suggested to use ``TP=1`` when FSDP is enabled for higher + throughput. FSDP-v2 is not supported with pipeline parallelism, expert + parallelism, MCore's distributed optimizer, gradient accumulation fusion, + or FP16. + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy + + To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + export NVTE_FUSED_ATTN_CK=0 + FORCE_BALANCE=true \ + RUN_ENV=cluster \ + MODEL_SIZE=671B \ + TRAIN_ITERS=50 \ + SEQ_LEN=4096 \ + NUM_LAYERS=3 \ + MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=32 \ + PR=bf16 \ + TP=1 PP=1 ETP=1 EP=8 \ + GEMM_TUNING=1 \ + NVTE_CK_USES_BWD_V3=1 \ + USE_GROUPED_GEMM=true MOE_USE_LEGACY_GROUPED_GEMM=true \ + GPT_LAYER_IN_TE=true \ + bash examples/deepseek_v3/train_deepseekv3.sh + +.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b + + To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel), + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + export NVTE_FUSED_ATTN_CK=0 + GEMM_TUNING=1 \ + PR=bf16 \ + MBS=4 \ + AC=none \ + SEQ_LEN=4096 \ + PAD_LEN=4096 \ + TRAIN_ITERS=50 \ + bash examples/deepseek_v2/train_deepseekv2.sh + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b + + To run training on a single node for Mixtral 8x7B (MoE with expert parallel), + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + TOKENIZER_MODEL= + RECOMPUTE_NUM_LAYERS=0 \ + TEE_OUTPUT=1 \ + MBS=1 \ + GBS=16 \ + TP_SIZE=1 \ + PP_SIZE=1 \ + AC=none \ + PR=bf16 \ + EP_SIZE=8 \ + ETP_SIZE=1 \ + SEQLEN=4096 \ + FORCE_BALANCE=true \ + MOCK_DATA=1 \ + RUN_ENV=cluster \ + MODEL_SIZE=8x7B \ + TRAIN_ITERS=50 \ + bash examples/mixtral/train_mixtral_moe.sh + +.. container:: model-doc pyt_megatron_lm_train_mixtral-8x22b-proxy + + To run training on a single node for Mixtral 8x7B (MoE with expert parallel) with 4-layer proxy, + navigate to the Megatron-LM folder and use the following command. + + .. code-block:: shell + + TOKENIZER_MODEL= + RECOMPUTE_NUM_LAYERS=4 \ + TEE_OUTPUT=1 \ + MBS=1 \ + GBS=16 \ + TP_SIZE=1 \ + PP_SIZE=1 \ + AC=full \ + NUM_LAYERS=4 \ + PR=bf16 \ + EP_SIZE=8 \ + ETP_SIZE=1 \ + SEQLEN=8192 \ + FORCE_BALANCE=true \ + MOCK_DATA=1 \ + RUN_ENV=cluster \ + MODEL_SIZE=8x22B \ + TRAIN_ITERS=50 \ + bash examples/mixtral/train_mixtral_moe.sh + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b + + To run training on a single node for Qwen 2.5 7B BF16, use the following + command. + + .. code-block:: shell + + bash examples/qwen/train_qwen2.sh TP=1 \ + CP=1 \ + PP=1 \ + MBS=10 \ + BS=640 \ + TE_FP8=0 \ + MODEL_SIZE=7 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-7B + + For FP8, use the following command. + + .. code-block:: shell + + bash examples/qwen/train_qwen2.sh \ + TP=1 \ + CP=1 \ + PP=1 \ + MBS=10 \ + BS=640 \ + TE_FP8=1 \ + MODEL_SIZE=7 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-7B + +.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b + + To run the training on a single node for Qwen 2.5 72B BF16, use the following command. 
+ + .. code-block:: shell + + bash examples/qwen/train_qwen2.sh \ + FSDP=1 \ + CP=1 \ + PP=1 \ + MBS=3 \ + BS=24 \ + TE_FP8=0 \ + MODEL_SIZE=72 \ + SEQ_LENGTH=2048 \ + TOTAL_ITERS=50 \ + MOCK_DATA=1 \ + TOKENIZER_MODEL=Qwen/Qwen2.5-72B \ + RECOMPUTE_ACTIVATIONS=full \ + CKPT_FORMAT=torch_dist + +Multi-node training examples +---------------------------- + +To run training on multiple nodes, launch the Docker container on each node. +For example, for Llama 3 using a two node setup (``NODE0`` as the master node), +use these commands. + +* On the master node ``NODE0``: + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + MASTER_ADDR=IP_NODE0 \ + NNODES=2 \ + NODE_RANK=0 \ + bash examples/llama/train_llama3.sh + +* On the worker node ``NODE1``: + + .. code-block:: shell + + TEE_OUTPUT=1 \ + MBS=2 \ + BS=256 \ + TP=1 \ + TE_FP8=1 \ + SEQ_LENGTH=8192 \ + MODEL_SIZE=8 \ + MASTER_ADDR=IP_NODE0 \ + NNODES=2 \ + NODE_RANK=1 \ + bash examples/llama/train_llama3.sh + +Or, for DeepSeek-V3, an example script ``train_deepseek_v3_slurm.sh`` is +provided in +``__ to +enable training at scale under a SLURM environment. For example, to run +training on 16 nodes, try the following command: + +.. code-block:: shell + + sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh + +.. _amd-megatron-lm-benchmark-test-vars-v256: + +Key options +----------- + +The benchmark tests support the following sets of variables. + +``TEE_OUTPUT`` + ``1`` to enable training logs or ``0`` to disable. + +``TE_FP8`` + ``0`` for B16 or ``1`` for FP8 -- ``0`` by default. + +``GEMM_TUNING`` + ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels. + +``USE_FLASH_ATTN`` + ``1`` to enable Flash Attention. + +``FSDP`` + ``1`` to enable PyTorch FSDP2. If FSDP is enabled, ``--use-distributed-optimizer``, + ``--overlap-param-gather``, and ``--sequence-parallel`` are automatically disabled. + +``ENABLE_PROFILING`` + ``1`` to enable PyTorch profiling for performance analysis. + +``transformer-impl`` + ``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE. + +``MODEL_SIZE`` + ``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2, for example. + +``TOTAL_ITERS`` + The total number of iterations -- ``10`` by default. + +``MOCK_DATA`` + ``1`` to use mock data or ``0`` to use real data you provide. + +``MBS`` + Micro batch size. + +``BS`` + Global batch size. + +``TP`` / ``TP_SIZE`` + Tensor parallel (``1``, ``2``, ``4``, ``8``). ``TP`` is disabled when ``FSDP`` is turned on. + +``EP`` / ``EP_SIZE`` + Expert parallel for MoE models. + +``SEQ_LENGTH`` + Input sequence length. + +``PR`` + Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs. + +``AC`` + Activation checkpointing (``none``, ``sel``, or ``full``) -- ``sel`` by default. + +``NUM_LAYERS`` + Use reduced number of layers as a proxy model. + +``RECOMPUTE_NUM_LAYERS`` + Number of layers used for checkpointing recompute. + +Previous versions +================= + +See :doc:`megatron-lm-history` to find documentation for previous releases +of the ``ROCm/megatron-lm`` Docker image. diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst new file mode 100644 index 000000000..0a80c7c9b --- /dev/null +++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst @@ -0,0 +1,602 @@ +.. 
meta:: + :description: How to train a model using Megatron-LM for ROCm. + :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch + +********************************************** +Training a model with Primus and Megatron-Core +********************************************** + +`Primus `__ is a unified and flexible +LLM training framework designed to streamline training. It streamlines LLM +training on AMD Instinct accelerators using a modular, reproducible configuration paradigm. +Primus is backend-agnostic and supports multiple training engines -- including Megatron-Core. + +.. note:: + + Primus with the Megatron-Core backend is intended to replace ROCm + Megatron-LM in this Dockerized training environment. To learn how to migrate + workloads from Megatron-LM to Primus with Megatron-Core, see + :doc:`previous-versions/megatron-lm-primus-migration-guide`. + +For ease of use, AMD provides a ready-to-use Docker image for MI300 series accelerators +containing essential components for Primus and Megatron-Core. + +.. note:: + + This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with + Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release `. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml + + {% set dockers = data.dockers %} + {% set docker = dockers[0] %} + .. list-table:: + :header-rows: 1 + + * - Software component + - Version + + {% for component_name, component_version in docker.components.items() %} + * - {{ component_name }} + - {{ component_version }} + {% endfor %} + +.. _amd-primus-megatron-lm-model-support: + +Supported models +================ + +The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators. +Some instructions, commands, and training examples in this documentation might +vary by model -- select one to get started. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml + + {% set model_groups = data.model_groups %} + .. raw:: html + +
+         <!-- Model selector: a "Model" button for each model group and a
+              "Model variant" button for each model defined in the YAML data above. -->
+ +.. note:: + + Some models, such as Llama, require an external license agreement through + a third party (for example, Meta). + +System validation +================= + +Before running AI workloads, it's important to validate that your AMD hardware is configured +correctly and performing optimally. + +If you have already validated your system settings, including aspects like NUMA auto-balancing, you +can skip this step. Otherwise, complete the procedures in the :ref:`System validation and +optimization ` guide to properly configure your system settings +before starting training. + +To test for optimal performance, consult the recommended :ref:`System health benchmarks +`. This suite of tests will help you verify and fine-tune your +system's configuration. + +.. _mi300x-amd-primus-megatron-lm-training: + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml + + {% set dockers = data.dockers %} + {% set docker = dockers[0] %} + + Environment setup + ================= + + Use the following instructions to set up the environment, configure the script to train models, and + reproduce the benchmark results on MI300X series accelerators with the ``{{ docker.pull_tag }}`` image. + + .. _amd-primus-megatron-lm-requirements: + + Download the Docker image + ------------------------- + + 1. Use the following command to pull the Docker image from Docker Hub. + + .. code-block:: shell + + docker pull {{ docker.pull_tag }} + + 2. Launch the Docker container. + + .. code-block:: shell + + docker run -it \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --network host --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME:$HOME \ + --shm-size 128G \ + --name primus_training_env \ + {{ docker.pull_tag }} + +3. Use these commands if you exit the ``primus_training_env`` container and need to return to it. + + .. code-block:: shell + + docker start primus_training_env + docker exec -it primus_training_env bash + +The Docker container hosts verified release tag ``v0.1.0-rc1`` of the `Primus +`__ repository. + +.. _amd-primus-megatron-lm-environment-setup: + +Configuration +============= + +Primus defines a training configuration in YAML for each model in +`examples/megatron/configs `__. + +.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml + + {% set model_groups = data.model_groups %} + {% for model_group in model_groups %} + {% for model in model_group.models %} + .. container:: model-doc {{ model.mad_tag }} + + To update training parameters for {{ model.model }}, you can update ``examples/megatron/configs/{{ model.config_name }}``. + Note that training configuration YAML files for other models follow this naming convention. + + {% endfor %} + {% endfor %} + +.. note:: + + See :ref:`Key options ` for more information on configuration options. + +Dataset options +--------------- + +You can use either mock data or real data for training. + +* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default + value is ``true`` for enabled. + + .. code-block:: yaml + + mock_data: true + +* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset. + + .. code-block:: bash + + mock_data: false + train_data_path: /path/to/your/dataset + + Ensure that the files are accessible inside the Docker container. + +.. 
_amd-primus-megatron-lm-tokenizer: + +Tokenizer +--------- + +In Primus, each model uses a tokenizer from Hugging Face. For example, Llama +3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and +``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model +`__ +definition. As such, you need to set the ``HF_TOKEN`` environment variable with +right permissions to access the tokenizer for each model. + +.. code-block:: bash + + # Export your HF_TOKEN in the workspace + export HF_TOKEN= + +.. _amd-primus-megatron-lm-run-training: + +Run training +============ + +Use the following example commands to set up the environment, configure +:ref:`key options `, and run training on +MI300X series accelerators with the AMD Megatron-LM environment. + +Single node training +-------------------- + +To run training on a single node, navigate to ``/workspace/Primus`` and use the following setup command: + +.. code-block:: shell + + pip install -r requirements.txt + export HSA_NO_SCRATCH_RECLAIM=1 + export NVTE_CK_USES_BWD_V3=1 + +Once setup is complete, run the appropriate training command. + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b + + To run pre-training for Llama 3.3 70B BF16, run: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --micro_batch_size 2 \ + --global_batch_size 16 \ + --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b + + To run pre-training for Llama 3.1 8B FP8, run: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid + + For Llama 3.1 8B BF16, use the following command: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \ + bash ./examples/run_pretrain.sh --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b + + To run pre-training for Llama 3.1 70B BF16, run: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 + + To run the training on a single node for Llama 3.1 70B FP8 with proxy, use the following command: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --num_layers 40 \ + --fp8 hybrid \ + --no_fp8_weight_transpose_cache true + + .. note:: + + Use two or more nodes to run the *full* Llama 70B model with FP8 precision. + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b + + To run pre-training for Llama 2 7B FP8, run: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \ + bash ./examples/run_pretrain.sh \ + --train_iters 50 \ + --fp8 hybrid + + To run pre-training for Llama 2 7B BF16, run: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \ + bash ./examples/run_pretrain.sh --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b + + To run pre-training for Llama 2 70B BF16, run: + + .. code-block:: shell + + EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \ + bash ./examples/run_pretrain.sh --train_iters 50 + +.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy + + To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, + use the following command: + + .. 
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy

   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with a 3-layer proxy,
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
      bash examples/run_pretrain.sh \
      --num_layers 3 \
      --moe_layer_freq 1 \
      --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b

   To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
      bash examples/run_pretrain.sh \
      --global_batch_size 256 \
      --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b

   To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy

   To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with a 4-layer proxy,
   use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
      bash examples/run_pretrain.sh \
      --num_layers 4 \
      --pipeline_model_parallel_size 1 \
      --micro_batch_size 1 \
      --global_batch_size 16 \
      --train_iters 50

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b

   To run training on a single node for Qwen 2.5 7B BF16, use the following
   command:

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

   For FP8, use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
      bash examples/run_pretrain.sh \
      --train_iters 50 \
      --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

   To run training on a single node for Qwen 2.5 72B BF16, use the following command:

   .. code-block:: shell

      EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
      bash examples/run_pretrain.sh --train_iters 50

Multi-node training examples
----------------------------

To run training on multiple nodes, use the
`run_slurm_pretrain.sh `__
script to launch the multi-node workload. Use the following steps to set up your environment:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

   {% set dockers = data.dockers %}
   {% set docker = dockers[0] %}

   .. code-block:: shell

      cd /workspace/Primus/
      export DOCKER_IMAGE={{ docker.pull_tag }}
      export HF_TOKEN=
      export HSA_NO_SCRATCH_RECLAIM=1
      export NVTE_CK_USES_BWD_V3=1
      export NCCL_IB_HCA= # specify which RDMA interfaces to use for communication
      export NCCL_SOCKET_IFNAME= # your network interface
      export GLOO_SOCKET_IFNAME= # your network interface
      export NCCL_IB_GID_INDEX=3 # InfiniBand GID index for NCCL communication; the default is 3 for RoCE

.. note::

   * Make sure the correct network drivers are installed on the nodes. If you are running inside a Docker container, either install the drivers inside the container or pass them through from the host when creating the container.
   * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus tries to auto-detect them. However, because NICs can vary across clusters, it's recommended to explicitly export the NCCL parameters for your cluster.
   * To find your network interface, use ``ip a``.
   * To find RDMA interfaces, use ``ibv_devices`` to list all the RDMA/IB devices.
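
Before exporting the NCCL and Gloo variables above, it can help to confirm what the RDMA devices and
network interfaces are actually called on each node. The following is a minimal sketch using standard
system tools (``ibv_devices``, ``ibv_devinfo``, and ``ip``); the device and interface names in the
example exports are placeholders for your cluster.

.. code-block:: shell

   # Run on every node before launching a multi-node job.

   # List RDMA devices; the names reported here (for example, mlx5_0) are what
   # NCCL_IB_HCA expects.
   ibv_devices

   # Check that the corresponding ports are ACTIVE.
   ibv_devinfo | grep -E "hca_id|state"

   # List network interfaces and addresses; pick the interface on the cluster
   # fabric for NCCL_SOCKET_IFNAME and GLOO_SOCKET_IFNAME.
   ip -brief addr show

   # Example exports (placeholder names; adjust to your cluster):
   export NCCL_IB_HCA=mlx5_0,mlx5_1
   export NCCL_SOCKET_IFNAME=ens51np0
   export GLOO_SOCKET_IFNAME=ens51np0
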
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b

   To train Llama 3.3 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
      --micro_batch_size 4 \
      --global_batch_size 256 \
      --recompute_num_layers 80 \
      --no_fp8_weight_transpose_cache true \
      --fp8 hybrid

   To train Llama 3.3 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
      --micro_batch_size 1 \
      --global_batch_size 256 \
      --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b

   To train Llama 3.1 8B FP8 on 8 nodes, run:

   .. code-block:: shell

      # Adjust the training parameters. For example, global_batch_size is 8 * the single-node batch size for 8 nodes in this case.
      NNODES=8 EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
      --global_batch_size 1024 \
      --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b

   To train Llama 3.1 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
      --micro_batch_size 4 \
      --global_batch_size 256 \
      --recompute_num_layers 80 \
      --no_fp8_weight_transpose_cache true \
      --fp8 hybrid

   To train Llama 3.1 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
      --micro_batch_size 1 \
      --global_batch_size 256 \
      --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b

   To train Llama 2 7B FP8 on 8 nodes, run:

   .. code-block:: shell

      # Adjust the training parameters. For example, global_batch_size is 8 * the single-node batch size for 8 nodes in this case.
      NNODES=8 EXP=examples/megatron/configs/llama2_7B-pretrain.yaml bash ./examples/run_slurm_pretrain.sh --global_batch_size 2048 --fp8 hybrid

.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b

   To train Llama 2 70B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
      --micro_batch_size 10 \
      --global_batch_size 640 \
      --recompute_num_layers 80 \
      --no_fp8_weight_transpose_cache true \
      --fp8 hybrid

   To train Llama 2 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
      --micro_batch_size 2 \
      --global_batch_size 1536 \
      --recompute_num_layers 12

.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b

   To train Mixtral 8x7B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
      --micro_batch_size 2 \
      --global_batch_size 256

.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

   To train Qwen 2.5 72B FP8 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
      --micro_batch_size 8 \
      --global_batch_size 512 \
      --recompute_num_layers 80 \
      --no_fp8_weight_transpose_cache true \
      --fp8 hybrid
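
A common adjustment when moving from one node to several, as noted in the comments above, is to scale
the global batch size with the node count. The following is a minimal sketch of that bookkeeping; the
single-node batch size of 128 is a placeholder rather than a tuned value, although with 8 nodes it
reproduces the ``--global_batch_size 1024`` used in the Llama 3.1 8B example above.

.. code-block:: shell

   # Hypothetical wrapper: scale the global batch size linearly with the number
   # of nodes before launching run_slurm_pretrain.sh.
   NNODES=8
   SINGLE_NODE_GBS=128   # placeholder single-node global batch size
   GLOBAL_BATCH_SIZE=$((NNODES * SINGLE_NODE_GBS))

   NNODES="${NNODES}" EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
     bash ./examples/run_slurm_pretrain.sh \
     --global_batch_size "${GLOBAL_BATCH_SIZE}" \
     --fp8 hybrid
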
.. _amd-primus-megatron-lm-benchmark-test-vars:

Key options
-----------

The following are key options to take note of:

fp8
   ``hybrid`` enables FP8 GEMMs.

use_torch_fsdp2
   ``use_torch_fsdp2: 1`` enables PyTorch FSDP v2. If FSDP is enabled,
   set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``.

profile
   To enable PyTorch profiling, set these parameters:

   .. code-block:: yaml

      profile: true
      use_pytorch_profiler: true
      profile_step_end: 7
      profile_step_start: 6

train_iters
   The total number of training iterations (default: 50).

mock_data
   ``true`` by default.

micro_batch_size
   Micro batch size.

global_batch_size
   Global batch size.

recompute_granularity
   For activation checkpointing.

num_layers
   For using a reduced number of layers, as with proxy models.

Previous versions
=================

See :doc:`previous-versions/megatron-lm-history` to find documentation for previous releases
of the ``ROCm/megatron-lm`` Docker image.

This training environment now uses Primus with Megatron as the primary
configuration. Limited support for the legacy ROCm Megatron-LM is still
available. For instructions on using ROCm Megatron-LM, see the
:doc:`megatron-lm` document.
diff --git a/docs/how-to/rocm-for-ai/training/index.rst b/docs/how-to/rocm-for-ai/training/index.rst
index 13213c2e9..7f2ce1d97 100644
--- a/docs/how-to/rocm-for-ai/training/index.rst
+++ b/docs/how-to/rocm-for-ai/training/index.rst
@@ -21,6 +21,8 @@ In this guide, you'll learn about:
 
 - Training a model
 
+  - :doc:`With Primus (Megatron-LM backend) `
+
   - :doc:`With Megatron-LM `
 
   - :doc:`With PyTorch `
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index 8560f0c68..db786f0c4 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -44,8 +44,8 @@ subtrees:
         title: Training
         subtrees:
           - entries:
-            - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
-              title: Train a model with Megatron-LM
+            - file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+              title: Train a model with Primus and Megatron-Core
             - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
               title: Train a model with PyTorch
             - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext