Update Megatron-LM training benchmark doc for v25.6 release (#5064)
@@ -713,11 +713,13 @@ linearized
linter
linux
llvm
lm
localscratch
logits
lossy
macOS
matchers
megatron
microarchitecture
migraphx
migratable
@@ -789,6 +791,7 @@ quantile
quantizer
quasirandom
queueing
qwen
radeon
rccl
rdc
docs/conf.py (27 changes)
@@ -105,10 +105,22 @@ article_pages = [
{"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},

{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
@@ -120,7 +132,16 @@ article_pages = [
{"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.7.3-20250325", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.3-20250415", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
@@ -1,29 +1,60 @@
megatron-lm_benchmark:
model_groups:
- group: Meta Llama
tag: llama
models:
dockers:
- pull_tag: rocm/megatron-lm:v25.6_py312
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
components:
ROCm: 6.4.1
PyTorch: 2.8.0a0+git7d205b2
Python: 3.12
Transformer Engine: 2.1.0.dev0+8c4a512
hipBLASLt: 393e413
Triton: 3.3.0
RCCL: 2.23.4.7a84c5d
doc_name: Ubuntu 24.04 + Python 3.12
- pull_tag: rocm/megatron-lm:v25.6_py310
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
components:
ROCm: 6.4.1
PyTorch: 2.8.0a0+git7d205b2
Python: "3.10"
Transformer Engine: 2.1.0.dev0+8c4a512
hipBLASLt: 393e413
Triton: 3.3.0
RCCL: 2.23.4.7a84c5d
doc_name: Ubuntu 22.04 + Python 3.10
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.3 70B
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
- model: Llama 3.1 8B
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
- model: Llama 3.1 70B
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
- model: Llama 3.1 70B (proxy)
mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
- model: Llama 2 7B
mad_tag: pyt_megatron_lm_train_llama-2-7b
- model: Llama 2 70B
mad_tag: pyt_megatron_lm_train_llama-2-70b
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-V3
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-V3 (proxy)
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
- model: DeepSeek-V2-Lite
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
- group: Mistral AI
tag: mistral
models:
- group: Mistral AI
tag: mistral
models:
- model: Mixtral 8x7B
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
- model: Mixtral 8x22B
- model: Mixtral 8x22B (proxy)
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
- group: Qwen
tag: qwen
models:
- model: Qwen 2.5 7B
mad_tag: pyt_megatron_lm_train_qwen2.5-7b
- model: Qwen 2.5 72B
mad_tag: pyt_megatron_lm_train_qwen2.5-72b
@@ -0,0 +1,29 @@
megatron-lm_benchmark:
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.3 70B
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
- model: Llama 3.1 8B
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
- model: Llama 3.1 70B
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
- model: Llama 2 7B
mad_tag: pyt_megatron_lm_train_llama-2-7b
- model: Llama 2 70B
mad_tag: pyt_megatron_lm_train_llama-2-70b
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-V3
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
- model: DeepSeek-V2-Lite
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
- group: Mistral AI
tag: mistral
models:
- model: Mixtral 8x7B
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
- model: Mixtral 8x22B
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
@@ -17,6 +17,7 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
- Resources

* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
(latest)
-
* ROCm 6.4.1
* vLLM 0.9.1
@@ -15,57 +15,51 @@ purpose-built to support models like Llama, DeepSeek, and Mixtral,
enabling developers to train next-generation AI models more
efficiently.

AMD provides a ready-to-use Docker image for MI300X series accelerators containing
AMD provides ready-to-use Docker images for MI300X series accelerators containing
essential components, including PyTorch, ROCm libraries, and Megatron-LM
utilities. It contains the following software components to accelerate training
workloads:

+--------------------------+--------------------------------+
| Software component       | Version                        |
+==========================+================================+
| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
| PyTorch                  | 2.8.0a0+gite2f9759             |
+--------------------------+--------------------------------+
| Python                   | 3.12 or 3.10                   |
+--------------------------+--------------------------------+
| Transformer Engine       | 1.13.0+bb061ade                |
+--------------------------+--------------------------------+
| Flash Attention          | 3.0.0                          |
+--------------------------+--------------------------------+
| hipBLASLt                | 0.13.0-4f18bf6                 |
+--------------------------+--------------------------------+
| Triton                   | 3.3.0                          |
+--------------------------+--------------------------------+
| RCCL                     | 2.22.3                         |
+--------------------------+--------------------------------+

Megatron-LM provides the following key features to train large language models efficiently:

- Transformer Engine (TE)

- APEX

- GEMM tuning

- Torch.compile

- 3D parallelism: TP + SP + CP

- Distributed optimizer

- Flash Attention (FA) 3

- Fused kernels

- Pre-training

.. _amd-megatron-lm-model-support:

The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml

{% set dockers = data.dockers %}
{% if dockers|length > 1 %}
.. tab-set::

{% for docker in data.dockers %}
.. tab-item:: ``{{ docker.pull_tag }}``
:sync: {{ docker.pull_tag }}

.. list-table::
:header-rows: 1

* - Software component
- Version

{% for component_name, component_version in docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}

{% endfor %}
{% endfor %}
{% elif dockers|length == 1 %}
.. list-table::
:header-rows: 1

* - Software component
- Version
{% for component_name, component_version in docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}

{% endfor %}
{% endif %}

.. _amd-megatron-lm-model-support:

The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

Supported models
================

@@ -73,8 +67,7 @@ The following models are pre-optimized for performance on AMD Instinct MI300X se
Some instructions, commands, and training recommendations in this documentation might
vary by model -- select one to get started.

{% set model_groups = data["megatron-lm_benchmark"].model_groups %}

{% set model_groups = data.model_groups %}
.. raw:: html

<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
@@ -82,7 +75,7 @@ The following models are pre-optimized for performance on AMD Instinct MI300X se
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
@@ -155,42 +148,77 @@ image.
Download the Docker image
-------------------------

1. Use the following command to pull the Docker image from Docker Hub.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml

.. tab-set::
{% set dockers = data.dockers %}
1. Use the following command to pull the Docker image from Docker Hub.

.. tab-item:: Ubuntu 24.04 + Python 3.12
:sync: py312
{% if dockers|length > 1 %}
.. tab-set::

.. code-block:: shell
{% for docker in data.dockers %}
.. tab-item:: {{ docker.doc_name }}
:sync: {{ docker.pull_tag }}

docker pull rocm/megatron-lm:v25.5_py312
.. code-block:: shell

.. tab-item:: Ubuntu 22.04 + Python 3.10
:sync: py310
docker pull {{ docker.pull_tag }}

.. code-block:: shell
{% endfor %}
{% elif dockers|length == 1 %}
{% set docker = dockers[0] %}
.. code-block:: shell

docker pull rocm/megatron-lm:v25.5_py310
docker pull {{ docker.pull_tag }}

2. Launch the Docker container.
{% endif %}
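For reference only (an illustrative rendering, not part of the committed template): with the v25.6 image tags listed in the benchmark YAML data earlier in this diff, the templated pull command evaluates to, for example:

.. code-block:: shell

   # pull one of the v25.6 Megatron-LM images listed in the YAML data
   docker pull rocm/megatron-lm:v25.6_py312
   docker pull rocm/megatron-lm:v25.6_py310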
2. Launch the Docker container.

.. tab-set::
{% if dockers|length > 1 %}
.. tab-set::

.. tab-item:: Ubuntu 24.04 + Python 3.12
:sync: py312
{% for docker in data.dockers %}
.. tab-item:: {{ docker.doc_name }}
:sync: {{ docker.pull_tag }}

.. code-block:: shell
.. code-block:: shell

docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py312
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--device /dev/infiniband \
--network host --ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 128G \
--name megatron_training_env \
{{ docker.pull_tag }}

{% endfor %}
{% elif dockers|length == 1 %}
{% set docker = dockers[0] %}
.. code-block:: shell

.. tab-item:: Ubuntu 22.04 + Python 3.10
:sync: py310
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--device /dev/infiniband \
--network host --ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 128G \
--name megatron_training_env \
{{ docker.pull_tag }}

.. code-block:: shell

docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py310
{% endif %}
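For reference only (an illustrative rendering, not part of the diff itself): rendered with the v25.6 ``py312`` image from the YAML data above, the templated launch command evaluates to, for example:

.. code-block:: shell

   # same flags as the template, with the pull tag substituted
   docker run -it \
     --device /dev/dri --device /dev/kfd --device /dev/infiniband \
     --network host --ipc host --group-add video \
     --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged \
     -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh \
     --shm-size 128G --name megatron_training_env \
     rocm/megatron-lm:v25.6_py312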
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
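The hunk ends here; for convenience, the referenced commands (unchanged from the previous revision and shown verbatim in the archived v25.5 page later in this diff) are:

.. code-block:: shell

   docker start megatron_training_env
   docker exec -it megatron_training_env bash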
@@ -348,6 +376,22 @@ If the tokenizer is not found, it'll be downloaded if publicly available.

TOKENIZER_MODEL=tokenizer/tokenizer.model

.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b

The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.

.. code-block:: shell

TOKENIZER_MODEL="Qwen/Qwen2.5-7B"

.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b

The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.

.. code-block:: shell

TOKENIZER_MODEL="Qwen/Qwen2.5-72B"

Dataset options
---------------

@@ -373,7 +417,7 @@ You can use either mock data or real data for training.
Download the dataset
^^^^^^^^^^^^^^^^^^^^

.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b pyt_megatron_lm_train_llama-3.1-70b-proxy

For Llama models, use the `prepare_dataset.sh
<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`_ script
@@ -412,8 +456,8 @@ Download the dataset
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
cd ..
bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3

To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.

@@ -437,8 +481,8 @@ Download the dataset
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
cd ..
bash tools/run_make_pretraining_dataset_megatron.sh deepseek-datasets/SlimPajama.json DeepSeekV3Tokenizer text deepseek-datasets deepseek-ai/DeepSeek-V3

To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.

@@ -448,8 +492,6 @@ Download the dataset

DATA_DIR="<path-to>/deepseek-datasets" # Change to where your dataset is stored

Ensure that the files are accessible inside the Docker container.

.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy

If you don't already have the dataset, download the Mixtral dataset using the following
@@ -472,6 +514,27 @@ Download the dataset

Ensure that the files are accessible inside the Docker container.

.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b pyt_megatron_lm_train_qwen2.5-72b
If you don't already have the dataset, download the Qwen dataset using the following
commands:

.. code-block:: shell

mkdir -p temp/qwen-datasets
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.bin
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/qwen-datasets/wudao_qwenbpe_text_document.idx

To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.

.. code-block:: bash

MOCK_DATA=0 # Train on real data

DATA_DIR="<path-to>/qwen-datasets" # Change to where your dataset is stored

Ensure that the files are accessible inside the Docker container.
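As a practical aside (not part of the committed text): the launch command shown earlier only bind-mounts ``$HOME``, so if the dataset directory lives elsewhere on the host, add another bind mount when starting the container, for example (hypothetical path):

.. code-block:: shell

   # elided flags (...) are the same as in the launch command above
   docker run -it ... -v /data/qwen-datasets:/data/qwen-datasets rocm/megatron-lm:v25.6_py312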
Multi-node configuration
------------------------

@@ -512,27 +575,17 @@ also be passed as command line arguments. Refer to the following example configu
# Specify which RDMA interfaces to use for communication
export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
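As an aside (not part of the committed text), the RDMA device names vary by system; before setting ``NCCL_IB_HCA`` you can typically list the available devices with the ``rdma`` tool from ``iproute2`` or ``ibv_devices`` from ``libibverbs-utils``:

.. code-block:: shell

   rdma link show
   ibv_devices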
Getting started
===============

The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
system performance, conduct training benchmarks, and achieve superior
performance for models like Llama, DeepSeek, and Mixtral. This container should not be
expected to provide generalized performance across all training workloads. You
can expect the container to perform in the model configurations described in
the following section, but other configurations are not validated by AMD.

.. _amd-megatron-lm-run-training:

Run training
------------
============

Use the following example commands to set up the environment, configure
:ref:`key options <amd-megatron-lm-benchmark-test-vars>`, and run training on
MI300X series accelerators with the AMD Megatron-LM environment.

Single node training
^^^^^^^^^^^^^^^^^^^^
--------------------

.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b

@@ -541,7 +594,20 @@ Single node training

.. code-block:: shell

TEE_OUTPUT=1 RECOMPUTE=1 SEQ_LENGTH=8192 MBS=2 BS=16 TE_FP8=0 TP=1 PP=1 FSDP=1 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
TOKENIZER_MODEL=meta-llama/Llama-3.3-70B-Instruct \
CKPT_FORMAT=torch_dist \
TEE_OUTPUT=1 \
RECOMPUTE=1 \
SEQ_LENGTH=8192 \
MBS=2 \
BS=16 \
TE_FP8=0 \
TP=1 \
PP=1 \
FSDP=1 \
MODEL_SIZE=70 \
TOTAL_ITERS=50 \
bash examples/llama/train_llama3.sh

.. note::

@@ -550,8 +616,6 @@ Single node training
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
or FP16.

Currently, FSDP is only compatible with BF16 precision.

.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b

To run training on a single node for Llama 3.1 8B FP8, navigate to the Megatron-LM folder and use the
@@ -559,13 +623,29 @@ Single node training

.. code-block:: shell

TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
TEE_OUTPUT=1 \
MBS=2 \
BS=128 \
TP=1 \
TE_FP8=1 \
SEQ_LENGTH=8192 \
MODEL_SIZE=8 \
TOTAL_ITERS=50 \
bash examples/llama/train_llama3.sh

For Llama 3.1 8B BF16, use the following command:

.. code-block:: shell

TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=0 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
TEE_OUTPUT=1 \
MBS=2 \
BS=128 \
TP=1 \
TE_FP8=0 \
SEQ_LENGTH=8192 \
MODEL_SIZE=8 \
TOTAL_ITERS=50 \
bash examples/llama/train_llama3.sh

.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b

@@ -574,7 +654,18 @@ Single node training

.. code-block:: shell

TEE_OUTPUT=1 MBS=3 BS=24 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=8192 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
CKPT_FORMAT=torch_dist \
TEE_OUTPUT=1 \
MBS=3 \
BS=24 \
TP=1 \
TE_FP8=0 \
FSDP=1 \
RECOMPUTE=1 \
SEQ_LENGTH=8192 \
MODEL_SIZE=70 \
TOTAL_ITERS=50 \
bash examples/llama/train_llama3.sh

.. note::

@@ -583,7 +674,36 @@ Single node training
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
or FP16.

Currently, FSDP is only compatible with BF16 precision.
.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b-proxy

To run the training on a single node for Llama 3.1 70B with proxy, use the following command.

.. code-block:: shell

CKPT_FORMAT=torch_dist \
TEE_OUTPUT=1 \
RECOMPUTE=1 \
MBS=3 \
BS=24 \
TP=1 \
TE_FP8=1 \
SEQ_LENGTH=8192 \
MODEL_SIZE=70 \
FSDP=1 \
TOTAL_ITERS=10 \
NUM_LAYERS=40 \
bash examples/llama/train_llama3.sh

.. note::

Use two or more nodes to run the *full* Llama 70B model with FP8 precision.

.. note::

It is suggested to use ``TP=1`` when FSDP is enabled for higher
throughput. FSDP-v2 is not supported with pipeline parallelism, expert
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
or FP16.

.. container:: model-doc pyt_megatron_lm_train_llama-2-7b

@@ -592,13 +712,29 @@ Single node training

.. code-block:: shell

TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
TEE_OUTPUT=1 \
MBS=4 \
BS=256 \
TP=1 \
TE_FP8=1 \
SEQ_LENGTH=4096 \
MODEL_SIZE=7 \
TOTAL_ITERS=50 \
bash examples/llama/train_llama2.sh

For Llama 2 7B BF16, use the following command:

.. code-block:: shell

TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=0 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
TEE_OUTPUT=1 \
MBS=4 \
BS=256 \
TP=1 \
TE_FP8=0 \
SEQ_LENGTH=4096 \
MODEL_SIZE=7 \
TOTAL_ITERS=50 \
bash examples/llama/train_llama2.sh

.. container:: model-doc pyt_megatron_lm_train_llama-2-70b

@@ -607,7 +743,18 @@ Single node training

.. code-block:: shell

TEE_OUTPUT=1 MBS=7 BS=56 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=4096 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
CKPT_FORMAT=torch_dist \
TEE_OUTPUT=1 \
MBS=7 \
BS=56 \
TP=1 \
TE_FP8=0 \
FSDP=1 \
RECOMPUTE=1 \
SEQ_LENGTH=4096 \
MODEL_SIZE=70 \
TOTAL_ITERS=50 \
bash examples/llama/train_llama2.sh

.. note::

@@ -616,8 +763,6 @@ Single node training
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
or FP16.

Currently, FSDP is only compatible with BF16 precision.

.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy

To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
@@ -625,7 +770,8 @@ Single node training

.. code-block:: shell

FORCE_BANLANCE=true \
export NVTE_FUSED_ATTN_CK=0
FORCE_BALANCE=true \
RUN_ENV=cluster \
MODEL_SIZE=671B \
TRAIN_ITERS=50 \
@@ -647,7 +793,15 @@ Single node training

.. code-block:: shell

GEMM_TUNING=1 PR=bf16 MBS=4 AC=none SEQ_LEN=4096 PAD_LEN=4096 TRAIN_ITERS=50 bash examples/deepseek_v2/train_deepseekv2.sh
export NVTE_FUSED_ATTN_CK=0
GEMM_TUNING=1 \
PR=bf16 \
MBS=4 \
AC=none \
SEQ_LEN=4096 \
PAD_LEN=4096 \
TRAIN_ITERS=50 \
bash examples/deepseek_v2/train_deepseekv2.sh

.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b

@@ -656,7 +810,24 @@ Single node training

.. code-block:: shell

RECOMPUTE_NUM_LAYERS=0 TEE_OUTPUT=1 MBS=1 GBS=16 TP_SIZE=1 PP_SIZE=1 AC=none PR=bf16 EP_SIZE=8 ETP_SIZE=1 SEQLEN=4096 FORCE_BALANCE=true MOCK_DATA=1 RUN_ENV=cluster MODEL_SIZE=8x7B TRAIN_ITERS=50 bash examples/mixtral/train_mixtral_moe.sh
TOKENIZER_MODEL=<path/to/tokenizer/model> \
RECOMPUTE_NUM_LAYERS=0 \
TEE_OUTPUT=1 \
MBS=1 \
GBS=16 \
TP_SIZE=1 \
PP_SIZE=1 \
AC=none \
PR=bf16 \
EP_SIZE=8 \
ETP_SIZE=1 \
SEQLEN=4096 \
FORCE_BALANCE=true \
MOCK_DATA=1 \
RUN_ENV=cluster \
MODEL_SIZE=8x7B \
TRAIN_ITERS=50 \
bash examples/mixtral/train_mixtral_moe.sh

.. container:: model-doc pyt_megatron_lm_train_mixtral-8x22b-proxy

@@ -665,10 +836,85 @@ Single node training

.. code-block:: shell

RECOMPUTE_NUM_LAYERS=4 TEE_OUTPUT=1 MBS=1 GBS=16 TP_SIZE=1 PP_SIZE=1 AC=full NUM_LAYERS=4 PR=bf16 EP_SIZE=8 ETP_SIZE=1 SEQLEN=8192 FORCE_BALANCE=true MOCK_DATA=1 RUN_ENV=cluster MODEL_SIZE=8x22B TRAIN_ITERS=50 bash examples/mixtral/train_mixtral_moe.sh
TOKENIZER_MODEL=<path/to/tokenizer/model> \
RECOMPUTE_NUM_LAYERS=4 \
TEE_OUTPUT=1 \
MBS=1 \
GBS=16 \
TP_SIZE=1 \
PP_SIZE=1 \
AC=full \
NUM_LAYERS=4 \
PR=bf16 \
EP_SIZE=8 \
ETP_SIZE=1 \
SEQLEN=8192 \
FORCE_BALANCE=true \
MOCK_DATA=1 \
RUN_ENV=cluster \
MODEL_SIZE=8x22B \
TRAIN_ITERS=50 \
bash examples/mixtral/train_mixtral_moe.sh

Multi-node training
^^^^^^^^^^^^^^^^^^^
.. container:: model-doc pyt_megatron_lm_train_qwen2.5-7b

To run training on a single node for Qwen 2.5 7B BF16, use the following
command.

.. code-block:: shell

bash examples/qwen/train_qwen2.sh TP=1 \
CP=1 \
PP=1 \
MBS=10 \
BS=640 \
TE_FP8=0 \
MODEL_SIZE=7 \
SEQ_LENGTH=2048 \
TOTAL_ITERS=50 \
MOCK_DATA=1 \
TOKENIZER_MODEL=Qwen/Qwen2.5-7B

For FP8, use the following command.

.. code-block:: shell

bash examples/qwen/train_qwen2.sh \
TP=1 \
CP=1 \
PP=1 \
MBS=10 \
BS=640 \
TE_FP8=1 \
MODEL_SIZE=7 \
SEQ_LENGTH=2048 \
TOTAL_ITERS=50 \
MOCK_DATA=1 \
TOKENIZER_MODEL=Qwen/Qwen2.5-7B

.. container:: model-doc pyt_megatron_lm_train_qwen2.5-72b

To run the training on a single node for Qwen 2.5 72B BF16, use the following command.

.. code-block:: shell

bash examples/qwen/train_qwen2.sh \
FSDP=1 \
CP=1 \
PP=1 \
MBS=3 \
BS=24 \
TE_FP8=0 \
MODEL_SIZE=72 \
SEQ_LENGTH=2048 \
TOTAL_ITERS=50 \
MOCK_DATA=1 \
TOKENIZER_MODEL=Qwen/Qwen2.5-72B \
RECOMPUTE_ACTIVATIONS=full \
CKPT_FORMAT=torch_dist

Multi-node training examples
----------------------------

To run training on multiple nodes, launch the Docker container on each node.
For example, for Llama 3 using a two node setup (``NODE0`` as the master node),
@@ -678,13 +924,33 @@ use these commands.

.. code-block:: shell

TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh
TEE_OUTPUT=1 \
MBS=2 \
BS=256 \
TP=1 \
TE_FP8=1 \
SEQ_LENGTH=8192 \
MODEL_SIZE=8 \
MASTER_ADDR=IP_NODE0 \
NNODES=2 \
NODE_RANK=0 \
bash examples/llama/train_llama3.sh

* On the worker node ``NODE1``:

.. code-block:: shell

TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh
TEE_OUTPUT=1 \
MBS=2 \
BS=256 \
TP=1 \
TE_FP8=1 \
SEQ_LENGTH=8192 \
MODEL_SIZE=8 \
MASTER_ADDR=IP_NODE0 \
NNODES=2 \
NODE_RANK=1 \
bash examples/llama/train_llama3.sh

Or, for DeepSeek-V3, an example script ``train_deepseek_v3_slurm.sh`` is
provided in
@@ -12,23 +12,23 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <http

.. list-table::
:header-rows: 1
:stub-columns: 1

* - Image version
- ROCm version
- JAX version
- Components
- Resources

* - 25.5
- 6.3.4
- 0.4.35
* - 25.5 (latest)
-
* ROCm 6.3.4
* JAX 0.4.35
-
* :doc:`Documentation <../jax-maxtext>`
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`__

* - 25.4
- 6.3.0
- 0.4.31
-
* ROCm 6.3.0
* JAX 0.4.31
-
* :doc:`Documentation <jax-maxtext-v25.4>`
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.4/images/sha256-fb3eb71cd74298a7b3044b7130cf84113f14d518ff05a2cd625c11ea5f6a7b01>`__
@@ -11,37 +11,49 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https

.. list-table::
:header-rows: 1
:stub-columns: 1

* - Image version
- ROCm version
- PyTorch version
- Components
- Resources

* - v25.5
- 6.3.4
- 2.8.0a0+gite2f9759
* - v25.6 (latest)
-
* ROCm 6.4.1
* PyTorch 2.8.0a0+git7d205b2
-
* :doc:`Documentation <../megatron-lm>`
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.5_py312/images/sha256-4506f18ba188d24189c6b1f95130b425f52c528a543bb3f420351824edceadc2>`__
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__

* - v25.5
-
* ROCm 6.3.4
* PyTorch 2.8.0a0+gite2f9759
-
* :doc:`Documentation <megatron-lm-v25.5>`
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.5_py312/images/sha256-4506f18ba188d24189c6b1f95130b425f52c528a543bb3f420351824edceadc2>`__
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.5_py310/images/sha256-743fbf1ceff7a44c4452f938d783a7abf143737d1c15b2b95f6f8a62e0fd048b>`__

* - v25.4
- 6.3.0
- 2.7.0a0+git637433
-
* ROCm 6.3.0
* PyTorch 2.7.0a0+git637433
-
* :doc:`Documentation <megatron-lm-v25.4>`
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.4/images/sha256-941aa5387918ea91c376c13083aa1e6c9cab40bb1875abbbb73bbb65d8736b3f>`__

* - v25.3
- 6.3.0
- 2.7.0a0+git637433
-
* ROCm 6.3.0
* PyTorch 2.7.0a0+git637433
-
* :doc:`Documentation <megatron-lm-v25.3>`
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.3/images/sha256-1e6ed9bdc3f4ca397300d5a9907e084ab5e8ad1519815ee1f868faf2af1e04e2>`__

* - v24.12-dev
- 6.1.0
- 2.4.0
-
* ROCm 6.1.0
* PyTorch 2.4.0
-
* :doc:`Documentation <megatron-lm-v24.12-dev>`
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/24.12-dev/images/sha256-5818c50334ce3d69deeeb8f589d83ec29003817da34158ebc9e2d112b929bf2e>`__
@@ -0,0 +1,775 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using Megatron-LM for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||
|
||||
******************************************
|
||||
Training a model with Megatron-LM for ROCm
|
||||
******************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm Megatron-LM
|
||||
training performance documentation. See :doc:`../megatron-lm` for the latest version.
|
||||
|
||||
The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
|
||||
a specialized fork of the robust Megatron-LM, designed to enable efficient
|
||||
training of large-scale language models on AMD GPUs. By leveraging AMD
|
||||
Instinct™ MI300X series accelerators, Megatron-LM delivers enhanced
|
||||
scalability, performance, and resource utilization for AI workloads. It is
|
||||
purpose-built to support models like Llama, DeepSeek, and Mixtral,
|
||||
enabling developers to train next-generation AI models more
|
||||
efficiently.
|
||||
|
||||
AMD provides a ready-to-use Docker image for MI300X series accelerators containing
|
||||
essential components, including PyTorch, ROCm libraries, and Megatron-LM
|
||||
utilities. It contains the following software components to accelerate training
|
||||
workloads:
|
||||
|
||||
+--------------------------+--------------------------------+
|
||||
| Software component | Version |
|
||||
+==========================+================================+
|
||||
| ROCm | 6.3.4 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch | 2.8.0a0+gite2f9759 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Python | 3.12 or 3.10 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformer Engine | 1.13.0+bb061ade |
|
||||
+--------------------------+--------------------------------+
|
||||
| Flash Attention | 3.0.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| hipBLASLt | 0.13.0-4f18bf6 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Triton | 3.3.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| RCCL | 2.22.3 |
|
||||
+--------------------------+--------------------------------+
|
||||
|
||||
Megatron-LM provides the following key features to train large language models efficiently:
|
||||
|
||||
- Transformer Engine (TE)
|
||||
|
||||
- APEX
|
||||
|
||||
- GEMM tuning
|
||||
|
||||
- Torch.compile
|
||||
|
||||
- 3D parallelism: TP + SP + CP
|
||||
|
||||
- Distributed optimizer
|
||||
|
||||
- Flash Attention (FA) 3
|
||||
|
||||
- Fused kernels
|
||||
|
||||
- Pre-training
|
||||
|
||||
.. _amd-megatron-lm-model-support-v255:
|
||||
|
||||
The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.5-benchmark-models.yaml
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are supported for training performance benchmarking with Megatron-LM and ROCm.
|
||||
Some instructions, commands, and training recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
{% set model_groups = data["megatron-lm_benchmark"].model_groups %}
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row">
|
||||
<div class="col-2 me-2 model-param-head">Model</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mt-1">
|
||||
<div class="col-2 me-2 model-param-head">Model variant</div>
|
||||
<div class="row col-10">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. note::
|
||||
|
||||
Some models, such as Llama, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
|
||||
.. _amd-megatron-lm-performance-measurements-v255:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`__
|
||||
page provides reference throughput and latency measurements for training
|
||||
popular AI models.
|
||||
|
||||
.. important::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`__
|
||||
only reflects the latest version of this training benchmarking environment.
|
||||
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
.. _mi300x-amd-megatron-lm-training-v255:
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
Use the following instructions to set up the environment, configure the script to train models, and
|
||||
reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker
|
||||
image.
|
||||
|
||||
.. _amd-megatron-lm-requirements-v255:
|
||||
|
||||
Download the Docker image
|
||||
-------------------------
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Ubuntu 24.04 + Python 3.12
|
||||
:sync: py312
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/megatron-lm:v25.5_py312
|
||||
|
||||
.. tab-item:: Ubuntu 22.04 + Python 3.10
|
||||
:sync: py310
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull rocm/megatron-lm:v25.5_py310
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Ubuntu 24.04 + Python 3.12
|
||||
:sync: py312
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py312
|
||||
|
||||
|
||||
.. tab-item:: Ubuntu 22.04 + Python 3.10
|
||||
:sync: py310
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py310
|
||||
|
||||
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start megatron_training_env
|
||||
docker exec -it megatron_training_env bash
|
||||
|
||||
The Docker container includes a pre-installed, verified version of the ROCm
|
||||
Megatron-LM development branch
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__, including necessary
|
||||
training scripts.
|
||||
|
||||
.. _amd-megatron-lm-environment-setup-v255:
|
||||
|
||||
Configuration
|
||||
=============
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b
|
||||
|
||||
Update the ``train_llama3.sh`` configuration script in the ``examples/llama``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
|
||||
|
||||
Update the ``train_llama2.sh`` configuration script in the ``examples/llama``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
|
||||
Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
|
||||
Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
|
||||
Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/mixtral>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v255>`.
|
||||
|
||||
.. note::
|
||||
|
||||
See :ref:`Key options <amd-megatron-lm-benchmark-test-vars-v255>` for more information on configuration options.
|
||||
|
||||
Network interface
|
||||
-----------------
|
||||
|
||||
Update the network interface in the script to match your system's network interface. To
|
||||
find your network interface, run the following (outside of any Docker container):
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ip a
|
||||
|
||||
Look for an active interface that has an IP address in the same subnet as
|
||||
your other nodes. Then, update the following variables in the script, for
|
||||
example:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
export NCCL_SOCKET_IFNAME=ens50f0np0
|
||||
|
||||
export GLOO_SOCKET_IFNAME=ens50f0np0
|
||||
|
||||
.. _amd-megatron-lm-tokenizer-v255:
|
||||
|
||||
Tokenizer
|
||||
---------
|
||||
|
||||
You can assign the path of an existing tokenizer to the ``TOKENIZER_MODEL`` as shown in the following examples.
|
||||
If the tokenizer is not found, it'll be downloaded if publicly available.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b
|
||||
|
||||
If you do not have Llama 3.3 tokenizer locally, you need to use your
|
||||
personal Hugging Face access token ``HF_TOKEN`` to download the tokenizer.
|
||||
See `Llama-3.3-70B-Instruct
|
||||
<https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_. After you are
|
||||
authorized, use your ``HF_TOKEN`` to download the tokenizer and set the
|
||||
variable ``TOKENIZER_MODEL`` to the tokenizer path.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=<Your personal Hugging Face access token>
|
||||
|
||||
The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TOKENIZER_MODEL="meta-llama/Llama-3.3-70B-Instruct"
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b
|
||||
|
||||
The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TOKENIZER_MODEL="meta-llama/Llama-3.1-8B"
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b
|
||||
|
||||
The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TOKENIZER_MODEL="meta-llama/Llama-3.1-70B"
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
|
||||
|
||||
The training script uses either the ``Llama2Tokenizer`` or ``HuggingFaceTokenizer`` by default.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
|
||||
The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TOKENIZER_MODEL="deepseek-ai/DeepSeek-V3"
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
|
||||
The training script uses the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TOKENIZER_MODEL="deepseek-ai/DeepSeek-V2-Lite"
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
|
||||
Download the Mixtral tokenizer.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
mkdir tokenizer
|
||||
cd tokenizer
|
||||
export HF_TOKEN=<Your personal Hugging Face access token>
|
||||
wget --header="Authorization: Bearer $HF_TOKEN" -O ./tokenizer.model https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/resolve/main/tokenizer.model
|
||||
|
||||
Use the ``HuggingFaceTokenizer``. Set ``TOKENIZER_MODEL`` to the appropriate Hugging Face model path.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TOKENIZER_MODEL=tokenizer/tokenizer.model
|
||||
|
||||
Dataset options
|
||||
---------------
|
||||
|
||||
You can use either mock data or real data for training.
|
||||
|
||||
* Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
|
||||
value is ``1`` for enabled.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=1
|
||||
|
||||
* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=0
|
||||
|
||||
DATA_PATH="/data/bookcorpus_text_sentence" # Change to where your dataset is stored
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
Download the dataset
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b pyt_megatron_lm_train_llama-3.1-8b pyt_megatron_lm_train_llama-3.1-70b pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
|
||||
|
||||
For Llama models, use the `prepare_dataset.sh
|
||||
<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`_ script
|
||||
to prepare your dataset.
|
||||
To download the dataset, set the ``DATASET`` variable to the dataset you'd
|
||||
like to use. Three datasets are supported: ``DATASET=wiki``, ``DATASET=fineweb``, and
|
||||
``DATASET=bookcorpus``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
DATASET=wiki TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for wiki-en dataset
|
||||
DATASET=bookcorpus TOKENIZER_MODEL=NousResearch/Llama-2-7b-chat-hf bash examples/llama/prepare_dataset.sh #for bookcorpus dataset
|
||||
|
||||
``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer.
|
||||
Remember to either pre-download the tokenizer or set up Hugging Face access
when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer-v255>` section.
|
||||
|
||||
.. note::
|
||||
|
||||
When training, set ``DATA_PATH`` to the file name prefix of the ``.bin`` and ``.idx`` files,
as in the following example:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
DATA_PATH="data/bookcorpus_text_sentence" # Change to where your dataset is stored.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
|
||||
If you don't already have the dataset, download the DeepSeek dataset using the following
|
||||
commands:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
mkdir deepseek-datasets
|
||||
cd deepseek-datasets
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
|
||||
wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
|
||||
|
||||
To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
MOCK_DATA=0 # Train on real data
|
||||
|
||||
DATA_DIR="<path-to>/deepseek-datasets" # Change to where your dataset is stored
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b

   If you don't already have the dataset, download the DeepSeek dataset using the following
   commands:

   .. code-block:: shell

      mkdir deepseek-datasets
      cd deepseek-datasets
      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx

   To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.

   .. code-block:: bash

      MOCK_DATA=0 # Train on real data

      DATA_DIR="<path-to>/deepseek-datasets" # Change to where your dataset is stored

   Ensure that the files are accessible inside the Docker container.

.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy

   If you don't already have the dataset, download the Mixtral dataset using the following
   commands:

   .. code-block:: shell

      mkdir mixtral-datasets
      cd mixtral-datasets
      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.bin
      wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/mistral-datasets/wudao_mistralbpe_content_document.idx

   To train on this data, update the ``DATA_DIR`` variable to point to the location of your dataset.

   .. code-block:: bash

      MOCK_DATA=0 # Train on real data

      DATA_DIR="<path-to>/mixtral-datasets" # Change to where your dataset is stored

   Ensure that the files are accessible inside the Docker container.

Multi-node configuration
------------------------

If you're running multi-node training, update the following environment variables. They can
also be passed as command-line arguments. Refer to the following example configurations.

* Change ``localhost`` to the master node's hostname:

  .. code-block:: shell

     MASTER_ADDR="${MASTER_ADDR:-localhost}"

* Set the number of nodes you want to train on (for instance, ``2``, ``4``, or ``8``):

  .. code-block:: shell

     NNODES="${NNODES:-1}"

* Set the rank of each node (``0`` for the master node, ``1`` for the first worker node, and so on):

  .. code-block:: shell

     NODE_RANK="${NODE_RANK:-0}"

* Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
  NFS directory) for multi-node runs:

  .. code-block:: shell

     DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs

* For multi-node runs, make sure the correct network drivers are installed on the nodes. If
  you're running inside a Docker container, either install the drivers inside the container or
  pass the network drivers through from the host when creating the container. You can verify
  the RDMA interfaces as shown in the sketch below.

  .. code-block:: shell

     # Specify which RDMA interfaces to use for communication
     export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7

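  As a quick, optional sanity check -- assuming the ``rdma-core`` user-space tools are
  installed on each node -- list the RDMA devices that are visible and compare them against
  the interfaces named in ``NCCL_IB_HCA``:

  .. code-block:: shell

     # List the RDMA devices visible on this node (names should match the NCCL_IB_HCA entries)
     ibv_devices

     # Show per-device details, such as port state
     ibv_devinfo
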
Getting started
===============

The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
system performance, conduct training benchmarks, and achieve superior
performance for models like Llama, DeepSeek, and Mixtral. This container should not be
expected to provide generalized performance across all training workloads. You
can expect the container to perform well in the model configurations described in
the following section, but other configurations are not validated by AMD.

.. _amd-megatron-lm-run-training-v255:

Run training
------------

Use the following example commands to set up the environment, configure
:ref:`key options <amd-megatron-lm-benchmark-test-vars-v255>`, and run training on
MI300X series accelerators with the AMD Megatron-LM environment.

Single node training
^^^^^^^^^^^^^^^^^^^^

.. container:: model-doc pyt_megatron_lm_train_llama-3.3-70b

   To run training on a single node for Llama 3.3 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument.
   For example, use the following command:

   .. code-block:: shell

      TEE_OUTPUT=1 RECOMPUTE=1 SEQ_LENGTH=8192 MBS=2 BS=16 TE_FP8=0 TP=1 PP=1 FSDP=1 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh

   .. note::

      Using ``TP=1`` is recommended when FSDP is enabled for higher
      throughput. FSDP-v2 is not supported with pipeline parallelism, expert
      parallelism, MCore's distributed optimizer, gradient accumulation fusion,
      or FP16.

      Currently, FSDP is only compatible with BF16 precision.

.. container:: model-doc pyt_megatron_lm_train_llama-3.1-8b

   To run training on a single node for Llama 3.1 8B FP8, navigate to the Megatron-LM folder and use the
   following command.

   .. code-block:: shell

      TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh

   For Llama 3.1 8B BF16, use the following command:

   .. code-block:: shell

      TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=0 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh

.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b

   To run training on a single node for Llama 3.1 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument.
   For example, use the following command:

   .. code-block:: shell

      TEE_OUTPUT=1 MBS=3 BS=24 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=8192 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh

   .. note::

      Using ``TP=1`` is recommended when FSDP is enabled for higher
      throughput. FSDP-v2 is not supported with pipeline parallelism, expert
      parallelism, MCore's distributed optimizer, gradient accumulation fusion,
      or FP16.

      Currently, FSDP is only compatible with BF16 precision.

.. container:: model-doc pyt_megatron_lm_train_llama-2-7b

   To run training on a single node for Llama 2 7B FP8, navigate to the Megatron-LM folder and use the
   following command.

   .. code-block:: shell

      TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh

   For Llama 2 7B BF16, use the following command:

   .. code-block:: shell

      TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=0 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh

.. container:: model-doc pyt_megatron_lm_train_llama-2-70b

   To run training on a single node for Llama 2 70B BF16 with FSDP-v2 enabled, add the ``FSDP=1`` argument.
   For example, use the following command:

   .. code-block:: shell

      TEE_OUTPUT=1 MBS=7 BS=56 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=4096 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh

   .. note::

      Using ``TP=1`` is recommended when FSDP is enabled for higher
      throughput. FSDP-v2 is not supported with pipeline parallelism, expert
      parallelism, MCore's distributed optimizer, gradient accumulation fusion,
      or FP16.

      Currently, FSDP is only compatible with BF16 precision.

.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy

   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with a 3-layer proxy,
   navigate to the Megatron-LM folder and use the following command.

   .. code-block:: shell

      FORCE_BANLANCE=true \
      RUN_ENV=cluster \
      MODEL_SIZE=671B \
      TRAIN_ITERS=50 \
      SEQ_LEN=4096 \
      NUM_LAYERS=3 \
      MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=32 \
      PR=bf16 \
      TP=1 PP=1 ETP=1 EP=8 \
      GEMM_TUNING=1 \
      NVTE_CK_USES_BWD_V3=1 \
      USE_GROUPED_GEMM=true MOE_USE_LEGACY_GROUPED_GEMM=true \
      GPT_LAYER_IN_TE=true \
      bash examples/deepseek_v3/train_deepseekv3.sh

.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b

   To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
   navigate to the Megatron-LM folder and use the following command.

   .. code-block:: shell

      GEMM_TUNING=1 PR=bf16 MBS=4 AC=none SEQ_LEN=4096 PAD_LEN=4096 TRAIN_ITERS=50 bash examples/deepseek_v2/train_deepseekv2.sh

.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b

   To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
   navigate to the Megatron-LM folder and use the following command.

   .. code-block:: shell

      RECOMPUTE_NUM_LAYERS=0 TEE_OUTPUT=1 MBS=1 GBS=16 TP_SIZE=1 PP_SIZE=1 AC=none PR=bf16 EP_SIZE=8 ETP_SIZE=1 SEQLEN=4096 FORCE_BALANCE=true MOCK_DATA=1 RUN_ENV=cluster MODEL_SIZE=8x7B TRAIN_ITERS=50 bash examples/mixtral/train_mixtral_moe.sh

.. container:: model-doc pyt_megatron_lm_train_mixtral-8x22b-proxy

   To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with a 4-layer proxy,
   navigate to the Megatron-LM folder and use the following command.

   .. code-block:: shell

      RECOMPUTE_NUM_LAYERS=4 TEE_OUTPUT=1 MBS=1 GBS=16 TP_SIZE=1 PP_SIZE=1 AC=full NUM_LAYERS=4 PR=bf16 EP_SIZE=8 ETP_SIZE=1 SEQLEN=8192 FORCE_BALANCE=true MOCK_DATA=1 RUN_ENV=cluster MODEL_SIZE=8x22B TRAIN_ITERS=50 bash examples/mixtral/train_mixtral_moe.sh

Multi-node training
^^^^^^^^^^^^^^^^^^^

To run training on multiple nodes, launch the Docker container on each node.
For example, for Llama 3 using a two-node setup (``NODE0`` as the master node),
use these commands.

* On the master node ``NODE0``:

  .. code-block:: shell

     TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh

* On the worker node ``NODE1``:

  .. code-block:: shell

     TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh

Or, for DeepSeek-V3, an example script, ``train_deepseek_v3_slurm.sh``, is
provided in
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to
enable training at scale in a SLURM environment. For example, to run
training on 16 nodes, try the following command:

.. code-block:: shell

   sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh

.. _amd-megatron-lm-benchmark-test-vars-v255:

Key options
-----------

The benchmark tests support the following variables.

``TEE_OUTPUT``
   ``1`` to enable training logs or ``0`` to disable.

``TE_FP8``
   ``0`` for BF16 or ``1`` for FP8 -- ``0`` by default.

``GEMM_TUNING``
   ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.

``USE_FLASH_ATTN``
   ``1`` to enable Flash Attention.

``FSDP``
   ``1`` to enable PyTorch FSDP2. If FSDP is enabled, ``--use-distributed-optimizer``,
   ``--overlap-param-gather``, and ``--sequence-parallel`` are automatically disabled.

``ENABLE_PROFILING``
   ``1`` to enable PyTorch profiling for performance analysis.

``transformer-impl``
   ``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.

``MODEL_SIZE``
   ``8B`` or ``70B`` for Llama 3 and 3.1, or ``7B`` or ``70B`` for Llama 2, for example.

``TOTAL_ITERS``
   The total number of iterations -- ``10`` by default.

``MOCK_DATA``
   ``1`` to use mock data or ``0`` to use real data you provide.

``MBS``
   Micro batch size.

``BS``
   Global batch size.

``TP`` / ``TP_SIZE``
   Tensor parallel size (``1``, ``2``, ``4``, or ``8``). ``TP`` is disabled when ``FSDP`` is turned on.

``EP`` / ``EP_SIZE``
   Expert parallel size for MoE models.

``SEQ_LENGTH``
   Input sequence length.

``PR``
   Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.

``AC``
   Activation checkpointing (``none``, ``sel``, or ``full``) -- ``sel`` by default.

``NUM_LAYERS``
   Use a reduced number of layers as a proxy model.

``RECOMPUTE_NUM_LAYERS``
   Number of layers to recompute during activation checkpointing.

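For illustration only, several of these options can be combined on one command line. The
following sketch assumes the Llama 3 training script shown earlier and uses mock data; it is
one possible combination, not a tuned configuration:

.. code-block:: shell

   # Hypothetical combination of the options above with the Llama 3 example script
   TEE_OUTPUT=1 MOCK_DATA=1 GEMM_TUNING=1 ENABLE_PROFILING=1 TE_FP8=0 \
   MBS=2 BS=128 TP=1 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=10 \
   bash examples/llama/train_llama3.sh
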
Previous versions
=================

See :doc:`megatron-lm-history` to find documentation for previous releases
of the ``ROCm/megatron-lm`` Docker image.

@@ -11,37 +11,39 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <

.. list-table::
   :header-rows: 1
   :stub-columns: 1

   * - Image version
     - ROCm version
     - PyTorch version
     - Components
     - Resources

   * - v25.6
     - 6.3.4
     - 2.8.0a0+git7d205b2
     -
       * ROCm 6.3.4
       * PyTorch 2.8.0a0+git7d205b2
     -
       * :doc:`Documentation <../pytorch-training>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__

   * - v25.5
     - 6.3.4
     - 2.7.0a0+git637433
     -
       * ROCm 6.3.4
       * PyTorch 2.7.0a0+git637433
     -
       * :doc:`Documentation <pytorch-training-v25.5>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`__

   * - v25.4
     - 6.3.0
     - 2.7.0a0+git637433
     -
       * ROCm 6.3.0
       * PyTorch 2.7.0a0+git637433
     -
       * :doc:`Documentation <pytorch-training-v25.4>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.4/images/sha256-fa98a9aa69968e654466c06f05aaa12730db79b48b113c1ab4f7a5fe6920a20b>`__

   * - v25.3
     - 6.3.0
     - 2.7.0a0+git637433
     -
       * ROCm 6.3.0
       * PyTorch 2.7.0a0+git637433
     -
       * :doc:`Documentation <pytorch-training-v25.3>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.3/images/sha256-0ffdde1b590fd2787b1c7adf5686875b100980b0f314090901387c44253e709b>`__
