Update PyTorch training benchmark docker doc to 25.7 (#5255)

* Update PyTorch training benchmark docker doc to 25.7 * update .wordlist.txt * update conf.py * update data sheet * fix sphinx warnings
2026-01-08 06:13:59 -05:00 · 2025-09-05 12:07:51 -04:00
parent 76fd6b2290
commit 4bc1bf00c6
15 changed files with 1079 additions and 340 deletions
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -918,6 +918,7 @@ toolchain
 toolchains
 toolset
 toolsets
+torchtitan
 torchvision
 tqdm
 tracebacks
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -124,11 +124,15 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
@@ -0,0 +1,120 @@
+unified_docker:
+  latest:
+    pull_tag: rocm/pytorch-training:v25.6
+    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
+    rocm_version: 6.4.1
+    pytorch_version: 2.8.0a0+git7d205b2
+    python_version: 3.10.17
+    transformer_engine_version: 1.14.0+2f85f5f2
+    flash_attention_version: 3.0.0.post1
+    hipblaslt_version: 0.15.0-8c6919d
+    triton_version: 3.3.0
+model_groups:
+  - group: Pre-training
+    tag: pre-training
+    models:
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [pretrain]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+      precision: BF16
+      training_modes: [pretrain]
+    - model: FLUX.1-dev
+      mad_tag: pyt_train_flux
+      model_repo: Flux
+      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+      precision: BF16
+      training_modes: [pretrain]
+  - group: Fine-tuning
+    tag: fine-tuning
+    models:
+    - model: Llama 4 Scout 17B-16E
+      mad_tag: pyt_train_llama-4-scout-17b-16e
+      model_repo: Llama-4-17B_16E
+      url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.3 70B
+      mad_tag: pyt_train_llama-3.3-70b
+      model_repo: Llama-3.3-70B
+      url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 3.2 1B
+      mad_tag: pyt_train_llama-3.2-1b
+      model_repo: Llama-3.2-1B
+      url: https://huggingface.co/meta-llama/Llama-3.2-1B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 3B
+      mad_tag: pyt_train_llama-3.2-3b
+      model_repo: Llama-3.2-3B
+      url: https://huggingface.co/meta-llama/Llama-3.2-3B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 Vision 11B
+      mad_tag: pyt_train_llama-3.2-vision-11b
+      model_repo: Llama-3.2-Vision-11B
+      url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.2 Vision 90B
+      mad_tag: pyt_train_llama-3.2-vision-90b
+      model_repo: Llama-3.2-Vision-90B
+      url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 3.1 405B
+      mad_tag: pyt_train_llama-3.1-405b
+      model_repo: Llama-3.1-405B
+      url: https://huggingface.co/meta-llama/Llama-3.1-405B
+      precision: BF16
+      training_modes: [finetune_qlora, HF_finetune_lora]
+    - model: Llama 3 8B
+      mad_tag: pyt_train_llama-3-8b
+      model_repo: Llama-3-8B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3 70B
+      mad_tag: pyt_train_llama-3-70b
+      model_repo: Llama-3-70B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 7B
+      mad_tag: pyt_train_llama-2-7b
+      model_repo: Llama-2-7B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 2 13B
+      mad_tag: pyt_train_llama-2-13b
+      model_repo: Llama-2-13B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 70B
+      mad_tag: pyt_train_llama-2-70b
+      model_repo: Llama-2-70B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
--- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
@@ -1,38 +1,17 @@
-unified_docker:
-  latest:
-    pull_tag: rocm/pytorch-training:v25.6
-    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
-    rocm_version: 6.4.1
-    pytorch_version: 2.8.0a0+git7d205b2
-    python_version: 3.10.17
-    transformer_engine_version: 1.14.0+2f85f5f2
-    flash_attention_version: 3.0.0.post1
-    hipblaslt_version: 0.15.0-8c6919d
-    triton_version: 3.3.0
+dockers:
+  - pull_tag: rocm/pytorch-training:v25.7
+    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
+    components:
+      ROCm: 6.4.2
+      PyTorch: 2.8.0a0+gitd06a406
+      Python: 3.10.18
+      Transformer Engine: 2.2.0.dev0+94e53dd8
+      Flash Attention: 3.0.0.post1
+      hipBLASLt: 1.1.0-4b9a52edfc
+      Triton: 3.3.0
 model_groups:
-  - group: Pre-training
-    tag: pre-training
-    models:
-    - model: Llama 3.1 8B
-      mad_tag: pyt_train_llama-3.1-8b
-      model_repo: Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-      training_modes: [pretrain]
-    - model: Llama 3.1 70B
-      mad_tag: pyt_train_llama-3.1-70b
-      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-      precision: BF16
-      training_modes: [pretrain]
-    - model: FLUX.1-dev
-      mad_tag: pyt_train_flux
-      model_repo: Flux
-      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-      precision: BF16
-      training_modes: [pretrain]
-  - group: Fine-tuning
-    tag: fine-tuning
+  - group: Meta Llama
+    tag: llama
    models:
    - model: Llama 4 Scout 17B-16E
      mad_tag: pyt_train_llama-4-scout-17b-16e
@@ -75,19 +54,19 @@ model_groups:
      model_repo: Llama-3.1-8B
      url: https://huggingface.co/meta-llama/Llama-3.1-8B
      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
+      training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
    - model: Llama 3.1 70B
      mad_tag: pyt_train_llama-3.1-70b
      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+      training_modes: [pretrain, finetune_fw, finetune_lora]
    - model: Llama 3.1 405B
      mad_tag: pyt_train_llama-3.1-405b
      model_repo: Llama-3.1-405B
      url: https://huggingface.co/meta-llama/Llama-3.1-405B
      precision: BF16
-      training_modes: [finetune_qlora, HF_finetune_lora]
+      training_modes: [finetune_qlora]
    - model: Llama 3 8B
      mad_tag: pyt_train_llama-3-8b
      model_repo: Llama-3-8B
@@ -117,4 +96,67 @@ model_groups:
      model_repo: Llama-2-70B
      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
      precision: BF16
-      training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
+      training_modes: [finetune_lora, finetune_qlora]
+  - group: OpenAI
+    tag: openai
+    models:
+    - model: GPT OSS 20B
+      mad_tag: pyt_train_gpt_oss_20b
+      model_repo: GPT-OSS-20B
+      url: https://huggingface.co/openai/gpt-oss-20b
+      precision: BF16
+      training_modes: [HF_finetune_lora]
+    - model: GPT OSS 120B
+      mad_tag: pyt_train_gpt_oss_120b
+      model_repo: GPT-OSS-120B
+      url: https://huggingface.co/openai/gpt-oss-120b
+      precision: BF16
+      training_modes: [HF_finetune_lora]
+  - group: Qwen
+    tag: qwen
+    models:
+    - model: Qwen 3 8B
+      mad_tag: pyt_train_qwen3-8b
+      model_repo: Qwen3-8B
+      url: https://huggingface.co/Qwen/Qwen3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Qwen 3 32B
+      mad_tag: pyt_train_qwen3-32b
+      model_repo: Qwen3-32
+      url: https://huggingface.co/Qwen/Qwen3-32B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2.5 32B
+      mad_tag: pyt_train_qwen2.5-32b
+      model_repo: Qwen2.5-32B
+      url: https://huggingface.co/Qwen/Qwen2.5-32B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2.5 72B
+      mad_tag: pyt_train_qwen2.5-72b
+      model_repo: Qwen2.5-72B
+      url: https://huggingface.co/Qwen/Qwen2.5-72B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2 1.5B
+      mad_tag: pyt_train_qwen2-1.5b
+      model_repo: Qwen2-1.5B
+      url: https://huggingface.co/Qwen/Qwen2-1.5B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Qwen 2 7B
+      mad_tag: pyt_train_qwen2-7b
+      model_repo: Qwen2-7B
+      url: https://huggingface.co/Qwen/Qwen2-7B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+  - group: Flux
+    tag: flux
+    models:
+    - model: FLUX.1-dev
+      mad_tag: pyt_train_flux
+      model_repo: Flux
+      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+      precision: BF16
+      training_modes: [pretrain]
--- a/docs/how-to/deep-learning-rocm.rst
+++ b/docs/how-to/deep-learning-rocm.rst
@@ -23,93 +23,92 @@ The table below summarizes information about ROCm-enabled deep learning framewor
      - Installation options
      - GitHub

-    * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`_
+    * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`_ 
-        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`_
-        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`_ 
-        - `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`__
+        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`__
+        - `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`__
      - .. raw:: html
-         
+
          <a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
-   
-    * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`_
+
+    * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`_
-        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`__

      - .. raw:: html
-         
+
          <a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a> 

-    * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`_
+    * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`__
      - .. raw:: html
-         
+
          <a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
-   
-    * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`_
+
+    * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`__
      - .. raw:: html
-         
+
          <a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>

-    * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`_
+    * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`__
      - .. raw:: html
-         
+
          <a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
-   
-    * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`_
+
+    * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
      - .. raw:: html
-         
+
          <a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a> 

-    * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`_
+    * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`__
      - .. raw:: html
-         
+
          <a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
-   
-    * - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`_
+
+    * - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`__
      - .. raw:: html
-         
+
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`_ 
-        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__

      - .. raw:: html
-         
-          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>      

+          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>

 Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
 through the following guides.
@@ -124,10 +123,3 @@ through the following guides.

 * :doc:`Use ROCm for AI inference optimization <rocm-for-ai/inference-optimization/index>`

-
-
-
-
-
-
-
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
@@ -46,7 +46,7 @@ vLLM inference performance testing
        - {{ unified_docker.hipblaslt_version }}

 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements>` for
+inference performance numbers <vllm-benchmark-performance-measurements-715>` for
 MI300X series accelerators.

 What's new
@@ -219,7 +219,7 @@ system's configuration.
            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.

-            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
+            Although the :ref:`available models <vllm-benchmark-available-models-715>` are preconfigured
            to collect latency and throughput performance data, you can also change the benchmarking
            parameters. See the standalone benchmarking tab for more information.

--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
@@ -39,7 +39,7 @@ vLLM inference performance testing
        - {{ unified_docker.hipblaslt_version }}

 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements>` for
+inference performance numbers <vllm-benchmark-performance-measurements-812>` for
 MI300X series accelerators.

 What's new
@@ -208,7 +208,7 @@ system's configuration.
            and ``{{ model.mad_tag }}_serving.csv``.

            Although the :ref:`available models
-            <vllm-benchmark-available-models>` are preconfigured to collect
+            <vllm-benchmark-available-models-812>` are preconfigured to collect
            offline throughput and online serving performance data, you can
            also change the benchmarking parameters. See the standalone
            benchmarking tab for more information.
--- a/docs/how-to/rocm-for-ai/install.rst
+++ b/docs/how-to/rocm-for-ai/install.rst
@@ -22,9 +22,9 @@ If you’re new to ROCm, refer to the :doc:`ROCm quick start install guide for L
 <rocm-install-on-linux:install/quick-start>`.

 If you’re using a Radeon GPU for graphics-accelerated applications, refer to the
-`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/docs-6.1.3/docs/install/native_linux/install-radeon.html>`_.
+`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/howto_native_linux.html>`_.

-You can install ROCm on :ref:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
+You can install ROCm on :doc:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
 distribution's package manager. See the following documentation resources to get started:

 * :doc:`ROCm installation overview <rocm-install-on-linux:install/install-overview>`
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
@@ -18,7 +18,7 @@ Training a model with ROCm Megatron-LM
 The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
 enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
 accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
-workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
+workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support-24-12>`
 like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
 efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.

@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-24-12:

 The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-25-3:

 The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

@@ -278,7 +278,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
   .. tab-item:: Llama
      :sync: llama

-      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
+      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``Llama2Tokenizer``.

      To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
      Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
@@ -292,7 +292,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
   .. tab-item:: DeepSeek V2
      :sync: deepseek

-      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
+      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``DeepSeekV2Tokenizer``.

 Multi-node training
 ^^^^^^^^^^^^^^^^^^^
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-25-4:

 The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

@@ -291,7 +291,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
   .. tab-item:: Llama
      :sync: llama

-      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
+      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``Llama2Tokenizer``
      or the default ``HuggingFaceTokenizer``.

      To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
@@ -320,7 +320,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
   .. tab-item:: DeepSeek V2
      :sync: deepseek

-      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
+      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``DeepSeekV2Tokenizer``.

 Multi-node training
 ^^^^^^^^^^^^^^^^^^^
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
     - Components
     - Resources

+   * - v25.7
+     - 
+       * ROCm 6.4.2
+       * PyTorch 2.8.0a0+gitd06a406
+     - 
+       * :doc:`Documentation <../pytorch-training>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712>`__
+
   * - v25.6
     - 
       * ROCm 6.3.4
       * PyTorch 2.8.0a0+git7d205b2
     - 
-       * :doc:`Documentation <../pytorch-training>`
+       * :doc:`Documentation <pytorch-training-v25.6>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__

   * - v25.5
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
@@ -437,3 +437,8 @@ Once the setup is complete, choose between two options to start benchmarking:

           ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst
@@ -0,0 +1,456 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using PyTorch for ROCm.
+   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with PyTorch for ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
+
+PyTorch is an open-source machine learning framework that is widely used for
+model training with GPU-optimized components for transformer-based models.
+
+The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_
+(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
+model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
+training workloads:
+
+--------------------------+--------------------------------+
+| Software component       | Version                        |
+==========================+================================+
+| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
+| PyTorch                  | 2.8.0a0+git7d205b2             |
+--------------------------+--------------------------------+
+| Python                   | 3.10.17                        |
+--------------------------+--------------------------------+
+| Transformer Engine       | 1.14.0+2f85f5f2                |
+--------------------------+--------------------------------+
+| Flash Attention          | 3.0.0.post1                    |
+--------------------------+--------------------------------+
+| hipBLASLt                | 0.15.0-8c6919d                 |
+--------------------------+--------------------------------+
+| Triton                   | 3.3.0                          |
+--------------------------+--------------------------------+
+
+.. _amd-pytorch-training-model-support-v256:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
+
+   {% set unified_docker = data.unified_docker.latest %}
+   {% set model_groups = data.model_groups %}
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+        <div class="row">
+          <div class="col-2 me-2 model-param-head">Workload</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+          </div>
+        </div>
+
+        <div class="row mt-1">
+          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+          </div>
+        </div>
+      </div>
+
+   .. note::
+
+      Some models require an external license agreement through a third party (for example, Meta).
+
+   .. _amd-pytorch-training-performance-measurements-v256:
+
+   Performance measurements
+   ========================
+
+   To evaluate performance, the
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+   page provides reference throughput and latency measurements for training
+   popular AI models.
+
+   .. note::
+
+      The performance data presented in
+      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+      should not be interpreted as the peak performance achievable by AMD
+      Instinct MI325X and MI300X accelerators or ROCm software.
+
+   System validation
+   =================
+
+   Before running AI workloads, it's important to validate that your AMD hardware is configured
+   correctly and performing optimally.
+
+   If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+   can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+   optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+   before starting training.
+
+   To test for optimal performance, consult the recommended :ref:`System health benchmarks
+   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+   system's configuration.
+
+   This Docker image is optimized for specific model configurations outlined
+   below. Performance can vary for other training workloads, as AMD
+   doesn’t validate configurations and run conditions outside those described.
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to start benchmarking:
+
+   .. tab-set::
+
+      .. tab-item:: MAD-integrated benchmarking
+
+         Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+         directory and install the required packages on the host machine.
+
+         .. code-block:: shell
+
+            git clone https://github.com/ROCm/MAD
+            cd MAD
+            pip install -r requirements.txt
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            For example, use this command to run the performance benchmark test on the {{ model.model }} model
+            using one GPU with the {{ model.precision }} data type on the host machine.
+
+            .. code-block:: shell
+
+               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+               madengine run \
+                   --tags {{ model.mad_tag }} \
+                   --keep-model-dir \
+                   --live-output \
+                   --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/perf.csv``.
+
+      {% endfor %}
+   {% endfor %}
+
+      .. tab-item:: Standalone benchmarking
+
+         .. rubric:: Download the Docker image and required packages
+
+         Use the following command to pull the Docker image from Docker Hub.
+
+         .. code-block:: shell
+
+            docker pull {{ unified_docker.pull_tag }}
+
+         Run the Docker container.
+
+         .. code-block:: shell
+
+            docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
+
+         Use these commands if you exit the ``training_env`` container and need to return to it.
+
+         .. code-block:: shell
+
+            docker start training_env
+            docker exec -it training_env bash
+
+         In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+         repository and navigate to the benchmark scripts directory
+         ``/workspace/MAD/scripts/pytorch_train``.
+
+         .. code-block:: shell
+
+            git clone https://github.com/ROCm/MAD
+            cd MAD/scripts/pytorch_train
+
+         .. rubric:: Prepare training datasets and dependencies
+
+         The following benchmarking examples require downloading models and datasets
+         from Hugging Face. To ensure successful access to gated repos, set your
+         ``HF_TOKEN``.
+
+         .. code-block:: shell
+
+            export HF_TOKEN=$your_personal_hugging_face_access_token
+
+         Run the setup script to install libraries and datasets needed for benchmarking.
+
+         .. code-block:: shell
+
+            ./pytorch_benchmark_setup.sh
+
+         .. container:: model-doc pyt_train_llama-3.1-8b
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``accelerate``
+                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+         .. container:: model-doc pyt_train_llama-3.1-70b
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+               * - ``torchdata``
+                 - `TorchData <https://pytorch.org/data/beta/index.html>`_
+
+               * - ``tomli``
+                 - `Tomli <https://pypi.org/project/tomli/>`_
+
+               * - ``tiktoken``
+                 - `tiktoken <https://github.com/openai/tiktoken>`_
+
+               * - ``blobfile``
+                 - `blobfile <https://pypi.org/project/blobfile/>`_
+
+               * - ``tabulate``
+                 - `tabulate <https://pypi.org/project/tabulate/>`_
+
+               * - ``wandb``
+                 - `Weights & Biases <https://github.com/wandb/wandb>`_
+
+               * - ``sentencepiece``
+                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+
+               * - ``tensorboard``
+                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+
+         .. container:: model-doc pyt_train_flux
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``accelerate``
+                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+               * - ``sentencepiece``
+                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+
+               * - ``tensorboard``
+                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+
+               * - ``csvkit``
+                 - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
+
+               * - ``deepspeed``
+                 - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
+
+               * - ``diffusers``
+                 - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
+
+               * - ``GitPython``
+                 - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
+
+               * - ``opencv-python-headless``
+                 - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
+
+               * - ``peft``
+                 - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
+
+               * - ``protobuf``
+                 - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
+
+               * - ``pytest``
+                 - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
+
+               * - ``python-dotenv``
+                 - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
+
+               * - ``seaborn``
+                 - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
+
+               * - ``transformers``
+                 - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
+
+         ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
+
+         * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+         {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Pretraining
+
+            To start the pre-training benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+            {% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
+               * - ``$datatype``
+                 - ``BF16`` or ``FP8``
+                 - Only Llama 3.1 8B supports FP8 precision.
+            {% else %}
+               * - ``$datatype``
+                 - ``BF16``
+                 - Only Llama 3.1 8B supports FP8 precision.
+            {% endif %}
+
+               * - ``$sequence_length``
+                 - Sequence length for the language model.
+                 - Between 2048 and 8192. 8192 by default.
+
+            {% if model.mad_tag == "pyt_train_flux" %}
+            .. container:: model-doc {{ model.mad_tag }}
+
+               .. note::
+
+                  Occasionally, downloading the Flux dataset might fail. In the event of this
+                  error, manually download it from Hugging Face at
+                  `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+                  and save it to `/workspace/FluxBenchmark`. This ensures that the test script can access
+                  the required dataset.
+            {% endif %}
+         {% endif %}
+
+         {% if model_group.tag == "fine-tuning" %}
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Fine-tuning
+
+            To start the fine-tuning benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               * - ``$training_mode``
+                 - ``finetune_fw``
+                 - Full weight fine-tuning (BF16 supported)
+
+               * -
+                 - ``finetune_lora``
+                 - LoRA fine-tuning (BF16 supported)
+
+               * -
+                 - ``finetune_qlora``
+                 - QLoRA fine-tuning (BF16 supported)
+
+               * -
+                 - ``HF_finetune_lora``
+                 - LoRA fine-tuning with Hugging Face PEFT
+
+               * - ``$datatype``
+                 - ``BF16``
+                 - All models support BF16.
+
+               * - ``$sequence_length``
+                 - Between 2048 and 16384.
+                 - Sequence length for the language model.
+
+            .. note::
+
+               {{ model.model }} currently supports the following fine-tuning methods:
+
+            {% for method in model.training_modes %}
+               * ``{{ method }}``
+            {% endfor %}
+            {% if model.training_modes|length < 4 %}
+
+               The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
+               does not currently provide YAML configuration files for other combinations of
+               model to fine-tuning method
+               However, you can still configure your own YAML files to enable support for
+               fine-tuning methods not listed here by following existing patterns in the
+               ``/workspace/torchtune/recipes/configs`` directory.
+            {% endif %}
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+
+               .. rubric:: Benchmarking examples
+
+               For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+
+Further reading
+===============
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
@@ -9,28 +9,25 @@ Training a model with PyTorch for ROCm
 PyTorch is an open-source machine learning framework that is widely used for
 model training with GPU-optimized components for transformer-based models.

-The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
-(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
-model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
-training workloads:
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

-+--------------------------+--------------------------------+
-| Software component       | Version                        |
-+==========================+================================+
-| ROCm                     | 6.3.4                          |
-+--------------------------+--------------------------------+
-| PyTorch                  | 2.8.0a0+git7d205b2             |
-+--------------------------+--------------------------------+
-| Python                   | 3.10.17                        |
-+--------------------------+--------------------------------+
-| Transformer Engine       | 1.14.0+2f85f5f2                |
-+--------------------------+--------------------------------+
-| Flash Attention          | 3.0.0.post1                    |
-+--------------------------+--------------------------------+
-| hipBLASLt                | 0.15.0-8c6919d                 |
-+--------------------------+--------------------------------+
-| Triton                   | 3.3.0                          |
-+--------------------------+--------------------------------+
+   {% set dockers = data.dockers %}
+   {% set docker = dockers[0] %}
+   The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__
+   (``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
+   model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
+   training workloads:
+
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}

 .. _amd-pytorch-training-model-support:

@@ -38,26 +35,27 @@ Supported models
 ================

 The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+Some instructions, commands, and training recommendations in this documentation might
+vary by model -- select one to get started.

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

-   {% set unified_docker = data.unified_docker.latest %}
+   {% set unified_docker = data.dockers[0] %}
   {% set model_groups = data.model_groups %}
-
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
-          <div class="col-2 me-2 model-param-head">Workload</div>
+          <div class="col-2 me-2 model-param-head">Model group</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
-            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
   {% endfor %}
          </div>
        </div>

        <div class="row mt-1">
-          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="col-2 me-2 model-param-head">Model variant</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
      {% set models = model_group.models %}
@@ -73,84 +71,116 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
        </div>
      </div>

-   .. note::

-      Some models require an external license agreement through a third party (for example, Meta).
+   .. _amd-pytorch-training-supported-training-modes:

-   .. _amd-pytorch-training-performance-measurements:
+   The following table lists supported training modes per model.

-   Performance measurements
-   ========================
+   .. dropdown:: Supported training modes

-   To evaluate performance, the
+      .. list-table::
+         :header-rows: 1
+
+         * - Model
+           - Supported training modes
+
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+         * - {{ model.model }}
+           - ``{{ model.training_modes | join('``, ``') }}``
+
+         {% endfor %}
+      {% endfor %}
+
+      .. note::
+
+         Some model and fine-tuning combinations are not listed. This is
+         because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
+         doesn't provide default YAML configurations for them.
+         For advanced usage, you can create a custom configuration to enable
+         unlisted fine-tuning methods by using an existing file in the
+         ``/workspace/torchtune/recipes/configs`` directory as a template.
+
+.. _amd-pytorch-training-performance-measurements:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+page provides reference throughput and latency measurements for training
+popular AI models.
+
+.. note::
+
+   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
-   page provides reference throughput and latency measurements for training
-   popular AI models.
+   should not be interpreted as the peak performance achievable by AMD
+   Instinct MI325X and MI300X accelerators or ROCm software.

-   .. note::
+System validation
+=================

-      The performance data presented in
-      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
-      should not be interpreted as the peak performance achievable by AMD
-      Instinct MI325X and MI300X accelerators or ROCm software.
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.

-   System validation
-   =================
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.

-   Before running AI workloads, it's important to validate that your AMD hardware is configured
-   correctly and performing optimally.
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.

-   If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-   can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-   optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
-   before starting training.
+This Docker image is optimized for specific model configurations outlined
+below. Performance can vary for other training workloads, as AMD
+doesn’t test configurations and run conditions outside those described.

-   To test for optimal performance, consult the recommended :ref:`System health benchmarks
-   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-   system's configuration.
+Run training
+============

-   This Docker image is optimized for specific model configurations outlined
-   below. Performance can vary for other training workloads, as AMD
-   doesn’t validate configurations and run conditions outside those described.
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

-   Benchmarking
-   ============
+   {% set unified_docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}

-   Once the setup is complete, choose between two options to start benchmarking:
+   Once the setup is complete, choose between two options to start benchmarking training:

   .. tab-set::

      .. tab-item:: MAD-integrated benchmarking

-         Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
-         directory and install the required packages on the host machine.
+         1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+            directory and install the required packages on the host machine.

-         .. code-block:: shell
+            .. code-block:: shell

-            git clone https://github.com/ROCm/MAD
-            cd MAD
-            pip install -r requirements.txt
+               git clone https://github.com/ROCm/MAD
+               cd MAD
+               pip install -r requirements.txt

   {% for model_group in model_groups %}
      {% for model in model_group.models %}

         .. container:: model-doc {{ model.mad_tag }}

-            For example, use this command to run the performance benchmark test on the {{ model.model }} model
-            using one GPU with the {{ model.precision }} data type on the host machine.
+            2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
+               using one node with the {{ model.precision }} data type on the host machine.

-            .. code-block:: shell
+               .. code-block:: shell

-               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-               madengine run \
-                   --tags {{ model.mad_tag }} \
-                   --keep-model-dir \
-                   --live-output \
-                   --timeout 28800
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{ model.mad_tag }} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800

-            MAD launches a Docker container with the name
-            ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
-            model are collected in the following path: ``~/MAD/perf.csv``.
+               MAD launches a Docker container with the name
+               ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
+               model are collected in ``~/MAD/perf.csv``.

      {% endfor %}
   {% endfor %}
@@ -159,222 +189,213 @@ The following models are pre-optimized for performance on the AMD Instinct MI325

         .. rubric:: Download the Docker image and required packages

-         Use the following command to pull the Docker image from Docker Hub.
+         1. Use the following command to pull the Docker image from Docker Hub.

-         .. code-block:: shell
+            .. code-block:: shell

-            docker pull {{ unified_docker.pull_tag }}
+               docker pull {{ unified_docker.pull_tag }}

-         Run the Docker container.
+         2. Run the Docker container.

-         .. code-block:: shell
+            .. code-block:: shell

-            docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
+               docker run -it \
+                   --device /dev/dri \
+                   --device /dev/kfd \
+                   --network host \
+                   --ipc host \
+                   --group-add video \
+                   --cap-add SYS_PTRACE \
+                   --security-opt seccomp=unconfined \
+                   --privileged \
+                   -v $HOME:$HOME \
+                   -v $HOME/.ssh:/root/.ssh \
+                   --shm-size 64G \
+                   --name training_env \
+                   {{ unified_docker.pull_tag }}

-         Use these commands if you exit the ``training_env`` container and need to return to it.
+            Use these commands if you exit the ``training_env`` container and need to return to it.

-         .. code-block:: shell
+            .. code-block:: shell

-            docker start training_env
-            docker exec -it training_env bash
+               docker start training_env
+               docker exec -it training_env bash

-         In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
-         repository and navigate to the benchmark scripts directory
-         ``/workspace/MAD/scripts/pytorch_train``.
+         3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+            repository and navigate to the benchmark scripts directory
+            ``/workspace/MAD/scripts/pytorch_train``.

-         .. code-block:: shell
+            .. code-block:: shell

-            git clone https://github.com/ROCm/MAD
-            cd MAD/scripts/pytorch_train
+               git clone https://github.com/ROCm/MAD
+               cd MAD/scripts/pytorch_train

         .. rubric:: Prepare training datasets and dependencies

-         The following benchmarking examples require downloading models and datasets
-         from Hugging Face. To ensure successful access to gated repos, set your
-         ``HF_TOKEN``.
+         1. The following benchmarking examples require downloading models and datasets
+            from Hugging Face. To ensure successful access to gated repos, set your
+            ``HF_TOKEN``.

-         .. code-block:: shell
+            .. code-block:: shell

-            export HF_TOKEN=$your_personal_hugging_face_access_token
+               export HF_TOKEN=$your_personal_hugging_face_access_token

-         Run the setup script to install libraries and datasets needed for benchmarking.
+         2. Run the setup script to install libraries and datasets needed for benchmarking.

-         .. code-block:: shell
+            .. code-block:: shell

-            ./pytorch_benchmark_setup.sh
+               ./pytorch_benchmark_setup.sh

-         .. container:: model-doc pyt_train_llama-3.1-8b
+            .. container:: model-doc pyt_train_llama-3.1-8b

-            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:

-            .. list-table::
-               :header-rows: 1
+               .. list-table::
+                  :header-rows: 1

-               * - Library
-                 - Reference
+                  * - Library
+                    - Reference

-               * - ``accelerate``
-                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+                  * - ``accelerate``
+                    - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

-               * - ``datasets``
-                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

-         .. container:: model-doc pyt_train_llama-3.1-70b
+            .. container:: model-doc pyt_train_llama-3.1-70b

-            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:

-            .. list-table::
-               :header-rows: 1
+               .. list-table::
+                  :header-rows: 1

-               * - Library
-                 - Reference
+                  * - Library
+                    - Reference

-               * - ``datasets``
-                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

-               * - ``torchdata``
-                 - `TorchData <https://pytorch.org/data/beta/index.html>`_
+                  * - ``torchdata``
+                    - `TorchData <https://pytorch.org/data/beta/index.html>`_

-               * - ``tomli``
-                 - `Tomli <https://pypi.org/project/tomli/>`_
+                  * - ``tomli``
+                    - `Tomli <https://pypi.org/project/tomli/>`_

-               * - ``tiktoken``
-                 - `tiktoken <https://github.com/openai/tiktoken>`_
+                  * - ``tiktoken``
+                    - `tiktoken <https://github.com/openai/tiktoken>`_

-               * - ``blobfile``
-                 - `blobfile <https://pypi.org/project/blobfile/>`_
+                  * - ``blobfile``
+                    - `blobfile <https://pypi.org/project/blobfile/>`_

-               * - ``tabulate``
-                 - `tabulate <https://pypi.org/project/tabulate/>`_
+                  * - ``tabulate``
+                    - `tabulate <https://pypi.org/project/tabulate/>`_

-               * - ``wandb``
-                 - `Weights & Biases <https://github.com/wandb/wandb>`_
+                  * - ``wandb``
+                    - `Weights & Biases <https://github.com/wandb/wandb>`_

-               * - ``sentencepiece``
-                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+                  * - ``sentencepiece``
+                    - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

-               * - ``tensorboard``
-                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+                  * - ``tensorboard``
+                    - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

-         .. container:: model-doc pyt_train_flux
+            .. container:: model-doc pyt_train_flux

-            ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:

-            .. list-table::
-               :header-rows: 1
+               .. list-table::
+                  :header-rows: 1

-               * - Library
-                 - Reference
+                  * - Library
+                    - Reference

-               * - ``accelerate``
-                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+                  * - ``accelerate``
+                    - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

-               * - ``datasets``
-                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

-               * - ``sentencepiece``
-                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+                  * - ``sentencepiece``
+                    - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

-               * - ``tensorboard``
-                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+                  * - ``tensorboard``
+                    - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

-               * - ``csvkit``
-                 - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
+                  * - ``csvkit``
+                    - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1

-               * - ``deepspeed``
-                 - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
+                  * - ``deepspeed``
+                    - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2

-               * - ``diffusers``
-                 - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
+                  * - ``diffusers``
+                    - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0

-               * - ``GitPython``
-                 - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
+                  * - ``GitPython``
+                    - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44

-               * - ``opencv-python-headless``
-                 - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
+                  * - ``opencv-python-headless``
+                    - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84

-               * - ``peft``
-                 - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
+                  * - ``peft``
+                    - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0

-               * - ``protobuf``
-                 - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
+                  * - ``protobuf``
+                    - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2

-               * - ``pytest``
-                 - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
+                  * - ``pytest``
+                    - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4

-               * - ``python-dotenv``
-                 - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
+                  * - ``python-dotenv``
+                    - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1

-               * - ``seaborn``
-                 - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
+                  * - ``seaborn``
+                    - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2

-               * - ``transformers``
-                 - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
+                  * - ``transformers``
+                    - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0

-         ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
+            ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:

-         * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
+            * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
-         {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
+         {% set training_modes = model.training_modes %}
+         {% set training_mode_descs = {
+            "pretrain": "Benchmark pre-training.",
+            "HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
+         } %}
+         {% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
+         {% if available_modes %}

         .. container:: model-doc {{ model.mad_tag }}

-            .. rubric:: Pretraining
+            .. rubric:: Pre-training

            To start the pre-training benchmark, use the following command with the
            appropriate options. See the following list of options and their descriptions.

            .. code-block:: shell

-               ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
-
-            .. list-table::
-               :header-rows: 1
-
-               * - Name
-                 - Options
-                 - Description
-
-            {% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
-               * - ``$datatype``
-                 - ``BF16`` or ``FP8``
-                 - Only Llama 3.1 8B supports FP8 precision.
-            {% else %}
-               * - ``$datatype``
-                 - ``BF16``
-                 - Only Llama 3.1 8B supports FP8 precision.
-            {% endif %}
-
-               * - ``$sequence_length``
-                 - Sequence length for the language model.
-                 - Between 2048 and 8192. 8192 by default.
+               ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length

            {% if model.mad_tag == "pyt_train_flux" %}
            .. container:: model-doc {{ model.mad_tag }}

               .. note::

+                  Currently, FLUX models are not supported out-of-the-box on {{ unified_docker.pull_tag }}.
+                  To use FLUX, refer to the previous version of the ``pytorch-training`` Docker: :doc:`previous-versions/pytorch-training-v25.6`
+
                  Occasionally, downloading the Flux dataset might fail. In the event of this
                  error, manually download it from Hugging Face at
                  `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
                  and save it to `/workspace/FluxBenchmark`. This ensures that the test script can access
                  the required dataset.
            {% endif %}
-         {% endif %}
-
-         {% if model_group.tag == "fine-tuning" %}
-         .. container:: model-doc {{ model.mad_tag }}
-
-            .. rubric:: Fine-tuning
-
-            To start the fine-tuning benchmark, use the following command with the
-            appropriate options. See the following list of options and their descriptions.
-
-            .. code-block:: shell
-
-               ./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length

            .. list-table::
               :header-rows: 1
@@ -383,53 +404,143 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
                 - Options
                 - Description

-               * - ``$training_mode``
-                 - ``finetune_fw``
-                 - Full weight fine-tuning (BF16 supported)
-
-               * -
-                 - ``finetune_lora``
-                 - LoRA fine-tuning (BF16 supported)
-
-               * -
-                 - ``finetune_qlora``
-                 - QLoRA fine-tuning (BF16 supported)
-
-               * -
-                 - ``HF_finetune_lora``
-                 - LoRA fine-tuning with Hugging Face PEFT
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}

               * - ``$datatype``
-                 - ``BF16``
-                 - All models support BF16.
+                 - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
+                 - Only Llama 3.1 8B supports FP8 precision.
+
+               * - ``$sequence_length``
+                 - Sequence length for the language model.
+                 - Between 2048 and 8192. 8192 by default.
+         {% endif %}
+
+         {% set training_mode_descs = {
+            "finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
+            "finetune_lora": "LoRA fine-tuning (BF16 supported).",
+            "finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
+            "HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
+         } %}
+         {% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
+         {% if available_modes %}
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Fine-tuning
+
+            To start the fine-tuning benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+            See :ref:`supported training modes <amd-pytorch-training-supported-training-modes>`.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t $training_mode \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}
+
+               * - ``$datatype``
+                 - ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
+                 - All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}

               * - ``$sequence_length``
                 - Between 2048 and 16384.
                 - Sequence length for the language model.

+            {% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
            .. note::

-               {{ model.model }} currently supports the following fine-tuning methods:
+               For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
+               use the following torchtune commit for compatibility:

-            {% for method in model.training_modes %}
-               * ``{{ method }}``
-            {% endfor %}
-            {% if model.training_modes|length < 4 %}
+               .. code-block:: shell
+
+                  git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
+
+            {% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
+            .. note::
+
+               You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
+               input tensor should be smaller than max_seq_len (4096)``.
+               This error indicates that an input sequence is longer than the model's maximum context window.
+
+               Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
+               tokens in this case). You can resolve this by truncating the input or splitting
+               it into smaller chunks before passing it to the model.
+
+               Note on reproducibility: The results in this guide are based on
+               commit ``b4c98ac`` from the upstream
+               `<https://github.com/pytorch/torchtune>`__ repository. For the
+               latest updates, you can use the main branch.

-               The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
-               does not currently provide YAML configuration files for other combinations of
-               model to fine-tuning method
-               However, you can still configure your own YAML files to enable support for
-               fine-tuning methods not listed here by following existing patterns in the
-               ``/workspace/torchtune/recipes/configs`` directory.
            {% endif %}
         {% endif %}
      {% endfor %}
   {% endfor %}

-               .. rubric:: Benchmarking examples
+            .. rubric:: Benchmarking examples

-               For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+            For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+
+Multi-node training
+-------------------
+
+Pre-training
+~~~~~~~~~~~~
+
+Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch run_slurm_train.sh
+
+Fine-tuning
+~~~~~~~~~~~
+
+Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   huggingface-cli login # Get access to HF Llama model space
+   huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch Torchtune_Multinode.sh
+
+.. note::
+
+   Information regarding benchmark setup:
+
+   * By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
+   * You can adjust the torchtune `YAML configuration file
+     <https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
+     if you're using a different model.
+   * The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
+   * Set the ``mounting_paths`` inside the SLURM script.
+
+Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.

 Further reading
 ===============