mirror of
https://github.com/ROCm/ROCm.git
synced 2026-01-10 15:18:11 -05:00
Compare commits
16 Commits
docs/7.0.2
...
users/davi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3acbd30cc3 | ||
|
|
bf7928b879 | ||
|
|
fdfd988c6a | ||
|
|
f21cfe1171 | ||
|
|
170cb47a4f | ||
|
|
d19a8e4a83 | ||
|
|
3a0b8529ed | ||
|
|
f9d7fc2e6a | ||
|
|
d424687191 | ||
|
|
35e6e50888 | ||
|
|
91cfe98eb3 | ||
|
|
036aaa2e78 | ||
|
|
78258e0f85 | ||
|
|
c79d9f74ef | ||
|
|
fb1b78c6f0 | ||
|
|
29a90f0271 |
@@ -37,6 +37,7 @@ parameters:
|
||||
- libdrm-dev
|
||||
- libelf-dev
|
||||
- libnuma-dev
|
||||
- libsimde-dev
|
||||
- ninja-build
|
||||
- pkg-config
|
||||
- name: rocmDependencies
|
||||
|
||||
@@ -70,7 +70,7 @@ parameters:
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: rccl_build_${{ job.target }}
|
||||
timeoutInMinutes: 90
|
||||
timeoutInMinutes: 120
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
|
||||
@@ -210,7 +210,7 @@ jobs:
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocprim'
|
||||
extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }} -E device_merge_inplace'
|
||||
extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }}'
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
|
||||
@@ -14,9 +14,13 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
- cmake
|
||||
- libdw-dev
|
||||
- libglfw3-dev
|
||||
- libmsgpack-dev
|
||||
- libopencv-dev
|
||||
- libtbb-dev
|
||||
- libtiff-dev
|
||||
- libva-amdgpu-dev
|
||||
- ninja-build
|
||||
- python3-pip
|
||||
- name: rocmDependencies
|
||||
@@ -35,6 +39,7 @@ parameters:
|
||||
- hipSPARSE
|
||||
- hipTensor
|
||||
- llvm-project
|
||||
- MIOpen
|
||||
- rocBLAS
|
||||
- rocFFT
|
||||
- rocJPEG
|
||||
@@ -47,6 +52,7 @@ parameters:
|
||||
- rocSPARSE
|
||||
- rocThrust
|
||||
- rocWMMA
|
||||
- rpp
|
||||
- name: rocmTestDependencies
|
||||
type: object
|
||||
default:
|
||||
@@ -63,6 +69,7 @@ parameters:
|
||||
- hipSPARSE
|
||||
- hipTensor
|
||||
- llvm-project
|
||||
- MIOpen
|
||||
- rocBLAS
|
||||
- rocFFT
|
||||
- rocminfo
|
||||
@@ -77,6 +84,7 @@ parameters:
|
||||
- rocThrust
|
||||
- roctracer
|
||||
- rocWMMA
|
||||
- rpp
|
||||
|
||||
- name: jobMatrix
|
||||
type: object
|
||||
@@ -105,6 +113,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
registerROCmPackages: true
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
parameters:
|
||||
cmakeVersion: '3.25.0'
|
||||
@@ -169,6 +178,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
registerROCmPackages: true
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
parameters:
|
||||
cmakeVersion: '3.25.0'
|
||||
|
||||
@@ -43,9 +43,14 @@ parameters:
|
||||
- ninja-build
|
||||
- python3-pip
|
||||
- python3-venv
|
||||
- googletest
|
||||
- libgtest-dev
|
||||
- libgmock-dev
|
||||
- libboost-filesystem-dev
|
||||
- name: pipModules
|
||||
type: object
|
||||
default:
|
||||
- msgpack
|
||||
- joblib
|
||||
- "packaging>=22.0"
|
||||
- pytest
|
||||
@@ -147,6 +152,13 @@ jobs:
|
||||
echo "##vso[task.prependpath]$USER_BASE/bin"
|
||||
echo "##vso[task.setvariable variable=PytestCmakePath]$USER_BASE/share/Pytest/cmake"
|
||||
displayName: Set cmake configure paths
|
||||
- task: Bash@3
|
||||
displayName: Add ROCm binaries to PATH
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
|
||||
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
|
||||
63
.azuredevops/dependencies/cli11.yml
Normal file
63
.azuredevops/dependencies/cli11.yml
Normal file
@@ -0,0 +1,63 @@
|
||||
parameters:
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
- name: cli11Version
|
||||
type: string
|
||||
default: ''
|
||||
- name: aptPackages
|
||||
type: object
|
||||
default:
|
||||
- cmake
|
||||
- git
|
||||
- ninja-build
|
||||
|
||||
- name: jobMatrix
|
||||
type: object
|
||||
default:
|
||||
buildJobs:
|
||||
- { os: ubuntu2204, packageManager: apt}
|
||||
- { os: almalinux8, packageManager: dnf}
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: cli11_${{ job.os }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool:
|
||||
vmImage: 'ubuntu-22.04'
|
||||
${{ if eq(job.os, 'almalinux8') }}:
|
||||
container:
|
||||
image: rocmexternalcicd.azurecr.io/manylinux228:latest
|
||||
endpoint: ContainerService3
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- checkout: none
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- task: Bash@3
|
||||
displayName: Clone cli11 ${{ parameters.cli11Version }}
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: git clone https://github.com/CLIUtils/CLI11.git -b ${{ parameters.cli11Version }}
|
||||
workingDirectory: $(Agent.BuildDirectory)
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
cmakeBuildDir: $(Agent.BuildDirectory)/cli11/build
|
||||
cmakeSourceDir: $(Agent.BuildDirectory)/cli11
|
||||
useAmdclang: false
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
23
.azuredevops/tag-builds/cli11.yml
Normal file
23
.azuredevops/tag-builds/cli11.yml
Normal file
@@ -0,0 +1,23 @@
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
|
||||
parameters:
|
||||
- name: cli11Version
|
||||
type: string
|
||||
default: "main"
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: pipelines_repo
|
||||
type: github
|
||||
endpoint: ROCm
|
||||
name: ROCm/ROCm
|
||||
|
||||
trigger: none
|
||||
pr: none
|
||||
|
||||
jobs:
|
||||
- template: ${{ variables.CI_DEPENDENCIES_PATH }}/cli11.yml
|
||||
parameters:
|
||||
cli11Version: ${{ parameters.cli11Version }}
|
||||
@@ -27,7 +27,6 @@ ASICs
|
||||
ASan
|
||||
ASAN
|
||||
ASm
|
||||
Async
|
||||
ATI
|
||||
atomicRMW
|
||||
AddressSanitizer
|
||||
@@ -131,7 +130,6 @@ ELMo
|
||||
ENDPGM
|
||||
EPYC
|
||||
ESXi
|
||||
EP
|
||||
EoS
|
||||
etcd
|
||||
fas
|
||||
@@ -183,7 +181,6 @@ GPR
|
||||
GPT
|
||||
GPU
|
||||
GPU's
|
||||
GPUDirect
|
||||
GPUs
|
||||
Graphbolt
|
||||
GraphSage
|
||||
@@ -246,7 +243,6 @@ Intersphinx
|
||||
Intra
|
||||
Ioffe
|
||||
JAX's
|
||||
JAXLIB
|
||||
Jinja
|
||||
JSON
|
||||
Jupyter
|
||||
@@ -302,7 +298,6 @@ Makefiles
|
||||
Matplotlib
|
||||
Matrox
|
||||
MaxText
|
||||
MBT
|
||||
Megablocks
|
||||
Megatrends
|
||||
Megatron
|
||||
@@ -312,7 +307,6 @@ Meta's
|
||||
Miniconda
|
||||
MirroredStrategy
|
||||
Mixtral
|
||||
MLA
|
||||
MosaicML
|
||||
MoEs
|
||||
Mooncake
|
||||
@@ -355,7 +349,6 @@ OFED
|
||||
OMM
|
||||
OMP
|
||||
OMPI
|
||||
OOM
|
||||
OMPT
|
||||
OMPX
|
||||
ONNX
|
||||
@@ -382,7 +375,6 @@ perf
|
||||
PEQT
|
||||
PIL
|
||||
PILImage
|
||||
PJRT
|
||||
POR
|
||||
PRNG
|
||||
PRs
|
||||
@@ -402,7 +394,6 @@ Profiler's
|
||||
PyPi
|
||||
Pytest
|
||||
PyTorch
|
||||
QPS
|
||||
Qcycles
|
||||
Qwen
|
||||
RAII
|
||||
@@ -505,6 +496,8 @@ TPS
|
||||
TPU
|
||||
TPUs
|
||||
TSME
|
||||
Taichi
|
||||
Taichi's
|
||||
Tagram
|
||||
TensileLite
|
||||
TensorBoard
|
||||
@@ -676,7 +669,6 @@ denoised
|
||||
denoises
|
||||
denormalize
|
||||
dequantization
|
||||
dequantized
|
||||
dequantizes
|
||||
deserializers
|
||||
detections
|
||||
@@ -792,7 +784,6 @@ linalg
|
||||
linearized
|
||||
linter
|
||||
linux
|
||||
llm
|
||||
llvm
|
||||
lm
|
||||
localscratch
|
||||
@@ -843,7 +834,6 @@ passthrough
|
||||
pe
|
||||
perfcounter
|
||||
performant
|
||||
piecewise
|
||||
perl
|
||||
pragma
|
||||
pre
|
||||
@@ -990,7 +980,6 @@ tokenizer
|
||||
tokenizes
|
||||
toolchain
|
||||
toolchains
|
||||
topk
|
||||
toolset
|
||||
toolsets
|
||||
torchtitan
|
||||
@@ -1018,7 +1007,6 @@ USM
|
||||
UTCL
|
||||
UTIL
|
||||
utils
|
||||
UX
|
||||
vL
|
||||
variational
|
||||
vdi
|
||||
|
||||
@@ -767,8 +767,8 @@ HIP runtime has the following functional improvements which improves runtime per
|
||||
|
||||
#### Upcoming changes
|
||||
|
||||
* `__AMDGCN_WAVEFRONT_SIZE__` macro and HIP’s `warpSize` variable as `constexpr` are deprecated and will be disabled in a future release. Users are encouraged to update their code if needed to ensure future compatibility. For more information, see [AMDGCN_WAVEFRONT_SIZE deprecation](https://rocm.docs.amd.com/en/docs-7.0.0/about/release-notes.html#amdgpu-wavefront-size-compiler-macro-deprecation).
|
||||
* The `roc-obj-ls` and `roc-obj-extract` tools are deprecated. To extract all Clang offload bundles into separate code objects use `llvm-objdump --offloading <file>`. For more information, see [Changes to ROCm Object Tooling](https://rocm.docs.amd.com/en/docs-7.0.0/about/release-notes.html#changes-to-rocm-object-tooling).
|
||||
* `__AMDGCN_WAVEFRONT_SIZE__` macro and HIP’s `warpSize` variable as `constexpr` are deprecated and will be disabled in a future release. Users are encouraged to update their code if needed to ensure future compatibility. For more information, see [AMDGCN_WAVEFRONT_SIZE deprecation](#amdgpu-wavefront-size-compiler-macro-deprecation).
|
||||
* The `roc-obj-ls` and `roc-obj-extract` tools are deprecated. To extract all Clang offload bundles into separate code objects use `llvm-objdump --offloading <file>`. For more information, see [Changes to ROCm Object Tooling](#changes-to-rocm-object-tooling).
|
||||
|
||||
### **MIGraphX** (2.13.0)
|
||||
|
||||
|
||||
26
RELEASE.md
26
RELEASE.md
@@ -91,7 +91,7 @@ firmware, AMD GPU drivers, and the ROCm user space software.
|
||||
<td rowspan="9" style="vertical-align: middle;">ROCm 7.0.2</td>
|
||||
<td>MI355X</td>
|
||||
<td>
|
||||
01.25.15.04<br>
|
||||
01.25.15.02 (or later)<br>
|
||||
01.25.13.09
|
||||
</td>
|
||||
<td>30.10.2<br>
|
||||
@@ -102,7 +102,7 @@ firmware, AMD GPU drivers, and the ROCm user space software.
|
||||
<tr>
|
||||
<td>MI350X</td>
|
||||
<td>
|
||||
01.25.15.04<br>
|
||||
01.25.15.02 (or later)<br>
|
||||
01.25.13.09
|
||||
</td>
|
||||
<td>30.10.2<br>
|
||||
@@ -112,7 +112,7 @@ firmware, AMD GPU drivers, and the ROCm user space software.
|
||||
<tr>
|
||||
<td>MI325X</td>
|
||||
<td>
|
||||
01.25.04.02<br>
|
||||
01.25.04.02 (or later)<br>
|
||||
01.25.03.03
|
||||
</td>
|
||||
<td>
|
||||
@@ -139,21 +139,21 @@ firmware, AMD GPU drivers, and the ROCm user space software.
|
||||
</tr>
|
||||
<tr>
|
||||
<td>MI300A</td>
|
||||
<td>BKC 26<br>
|
||||
<td>BKC 26 (or later)<br>
|
||||
BKC 25</td>
|
||||
<td rowspan="3" style="vertical-align: middle;">Not Applicable</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>MI250X</td>
|
||||
<td>IFWI 47</td>
|
||||
<td>IFWI 47 (or later)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>MI250</td>
|
||||
<td>MU3 w/ IFWI 73</td>
|
||||
<td>MU5 w/ IFWI 75 (or later)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>MI210</td>
|
||||
<td>MU3 w/ IFWI 73</td>
|
||||
<td>MU5 w/ IFWI 75 (or later)</td>
|
||||
<td>8.4.0.K</td>
|
||||
</tr>
|
||||
<tr>
|
||||
@@ -164,7 +164,7 @@ firmware, AMD GPU drivers, and the ROCm user space software.
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<p id="footnote1">[1]: PLDM bundle 01.25.05.00 will be available by November 2025.</p>
|
||||
<p id="footnote1">[1]: PLDM bundle 01.25.05.00 will be available by October 31, 2025.</p>
|
||||
|
||||
#### AMD Instinct MI300X GPU resiliency improvement
|
||||
|
||||
@@ -190,7 +190,7 @@ ROCm-LS provides the following tools to build a complete workflow for life scien
|
||||
|
||||
* The hipCIM library provides powerful support for GPU-accelerated I/O operations, coupled with an array of computer vision and image processing primitives designed for N-dimensional image data in fields such as biomedical imaging. For more information, see the [hipCIM documentation](https://rocm.docs.amd.com/projects/hipCIM/en/latest/).
|
||||
|
||||
* MONAI for AMD ROCm, a ROCm-enabled version of {fab}`github` [MONAI](https://github.com/Project-MONAI/MONAI), is built on top of [PyTorch for AMD ROCm](https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package/), helping healthcare and life science innovators to leverage GPU acceleration with AMD Instinct GPUs for high-performance inference and training of medical AI applications. For more information, see the [MONAI for AMD ROCm documentation](https://rocm.docs.amd.com/projects/monai/en/latest/).
|
||||
* MONAI for AMD ROCm, a ROCm-enabled version of [MONAI](https://monai.io/), is built on top of [PyTorch for AMD ROCm](https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package/), helping healthcare and life science innovators to leverage GPU acceleration with AMD Instinct GPUs for high-performance inference and training of medical AI applications. For more information, see the [MONAI for AMD ROCm documentation](https://rocm.docs.amd.com/projects/monai/en/latest/).
|
||||
|
||||
### Deep learning and AI framework updates
|
||||
|
||||
@@ -241,6 +241,8 @@ ROCm documentation continues to be updated to provide clearer and more comprehen
|
||||
|
||||
For more information about the changes, see the [Changelog for the AI Developer Hub](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/changelog.html).
|
||||
|
||||
* ROCm components support a wide range of environment variables that can be used for testing, logging, debugging, experimental features, and more. The [rocBLAS](https://rocm.docs.amd.com/projects/rocBLAS/en/docs-7.0.2/reference/env-variables.html) and [RCCL](https://rocm.docs.amd.com/projects/rccl/en/docs-7.0.2/api-reference/env-variables.html) components have been updated with new environment variable content.
|
||||
|
||||
## ROCm components
|
||||
|
||||
The following table lists the versions of ROCm components for ROCm 7.0.2, including any version
|
||||
@@ -710,11 +712,7 @@ The issue will be resolved in a future ROCm release. See [GitHub issue #5500](ht
|
||||
|
||||
### Applications using OpenCV might fail due to package incompatibility between the OS
|
||||
|
||||
OpenCV packages built on Ubuntu 24.04 are incompatible with Debian 13 due to a version conflict. As a result, applications, tests, and samples that use OpenCV might fail. As a workaround, rebuild OpenCV with the version corresponding to Debian 13 from source, followed by the application that uses OpenCV. This issue will be fixed in a future ROCm release. See [GitHub issue #5501](https://github.com/ROCm/ROCm/issues/5501).
|
||||
|
||||
### Libva-based applications might fail after ROCm installation
|
||||
|
||||
After installing ROCm, certain applications that are dependent on the libva library (such as `vainfo` and `ffmpeg`) might fail to function correctly. This issue is only relevant if you're using libva-based applications outside of ROCm on RHEL 8.10 and Oracle Linux 8. The failure occurs due to a symbol clash between the AMD-packaged `libva-amdgpu` and the system-provided libva. This conflict was introduced when adapting the RHEL 8 build to support additional operating systems, which required changes to the build options. The issue will be fixed in a future ROCm release. See [GitHub issue #5732](https://github.com/ROCm/ROCm/issues/5732).
|
||||
OpenCV packages built on Ubuntu 24.04 are incompatible with Debian 13 due to a version conflict. As a result, applications, tests, and samples that use OpenCV might fail. To avoid the version conflict, rebuild OpenCV with the version corresponding to Debian 13, then rebuild MIVisionX on top of it. As a workaround, rebuild OpenCV from source, followed by the application that uses OpenCV. This issue will be fixed in a future ROCm release. See [GitHub issue #5501](https://github.com/ROCm/ROCm/issues/5501).
|
||||
|
||||
## ROCm upcoming changes
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ ROCm Version,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6
|
||||
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,b6356,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
@@ -95,7 +96,7 @@ ROCm Version,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6
|
||||
:doc:`rocThrust <rocthrust:index>`,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.0.51831,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.0.51830,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
|
||||
|
@@ -114,7 +114,7 @@ compatibility and system requirements.
|
||||
:doc:`rocThrust <rocthrust:index>`,4.0.0,4.0.0,3.3.0
|
||||
,,,
|
||||
SUPPORT LIBS,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.0.51831,7.0.51830,6.4.43482
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.0.51830,7.0.51830,6.4.43482
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.2,7.0.1/7.0.0,6.4.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
|
||||
,,,
|
||||
@@ -291,6 +291,7 @@ Expand for full historical view of:
|
||||
.. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is supported only on ROCm 6.3.0.
|
||||
.. [#dgl_compat-past-60] DGL is supported only on ROCm 6.4.0.
|
||||
.. [#megablocks_compat-past-60] Megablocks is supported only on ROCm 6.3.0.
|
||||
.. [#taichi_compat-past-60] Taichi is supported only on ROCm 6.3.2.
|
||||
.. [#ray_compat-past-60] Ray is supported only on ROCm 6.4.1.
|
||||
.. [#llama-cpp_compat-past-60] llama.cpp is supported only on ROCm 7.0.0 and 6.4.x.
|
||||
.. [#flashinfer_compat-past-60] FlashInfer is supported only on ROCm 6.4.1.
|
||||
|
||||
@@ -47,21 +47,6 @@ with ROCm support:
|
||||
`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
|
||||
follow upstream JAX releases and use the latest available ROCm version.
|
||||
|
||||
JAX Plugin-PJRT with JAX/JAXLIB compatibility
|
||||
================================================================================
|
||||
|
||||
Portable JIT Runtime (PJRT) is an open, stable interface for device runtime and compiler. The table below shows the compatibility between the JAX Plugin–PJRT and JAX/JAXLIB.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - JAX Plugin-PJRT
|
||||
- JAX/JAXLIB
|
||||
- ROCm
|
||||
* - 0.6.0
|
||||
- 0.6.2, 0.6.0
|
||||
- 7.0.2, 7.0.1, 7.0.0
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
|
||||
@@ -407,7 +407,7 @@ with ROCm.
|
||||
|
||||
**Note:** Only official release exists.
|
||||
|
||||
Key features and enhancements for PyTorch 2.7/2.8 with ROCm 7.0
|
||||
Key features and enhancements for PyTorch 2.7 with ROCm 7.0
|
||||
================================================================================
|
||||
|
||||
- Enhanced TunableOp framework: Introduces ``tensorfloat32`` support for
|
||||
@@ -442,6 +442,10 @@ Key features and enhancements for PyTorch 2.7/2.8 with ROCm 7.0
|
||||
ROCm-specific test conditions, and enhanced unit test coverage for Flash
|
||||
Attention and Memory Efficient operations.
|
||||
|
||||
- Build system and infrastructure improvements: Provides updated CentOS Stream 9
|
||||
support, improved Docker configuration, migration to public MAGMA repository,
|
||||
and enhanced QA automation scripts for PyTorch unit testing.
|
||||
|
||||
- Composable Kernel (CK) updates: Features updated CK submodule integration with
|
||||
the latest optimizations and performance improvements for core mathematical
|
||||
operations.
|
||||
@@ -463,7 +467,7 @@ Key features and enhancements for PyTorch 2.7/2.8 with ROCm 7.0
|
||||
network training or inference. For AMD platforms, ``amdclang++`` has been
|
||||
validated as the supported compiler for building these extensions.
|
||||
|
||||
Known issues and notes for PyTorch 2.7/2.8 with ROCm 7.0
|
||||
Known issues and notes for PyTorch 2.7 with ROCm 7.0
|
||||
================================================================================
|
||||
|
||||
- The ``matmul.allow_fp16_reduced_precision_reduction`` and
|
||||
|
||||
76
docs/compatibility/ml-compatibility/taichi-compatibility.rst
Normal file
76
docs/compatibility/ml-compatibility/taichi-compatibility.rst
Normal file
@@ -0,0 +1,76 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Taichi compatibility
|
||||
:keywords: GPU, Taichi compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
Taichi compatibility
|
||||
*******************************************************************************
|
||||
|
||||
`Taichi <https://www.taichi-lang.org/>`_ is an open-source, imperative, and parallel
|
||||
programming language designed for high-performance numerical computation.
|
||||
Embedded in Python, it leverages just-in-time (JIT) compilation frameworks such as LLVM to accelerate
|
||||
compute-intensive Python code by compiling it to native GPU or CPU instructions.
|
||||
|
||||
Taichi is widely used across various domains, including real-time physical simulation,
|
||||
numerical computing, augmented reality, artificial intelligence, computer vision, robotics,
|
||||
visual effects in film and gaming, and general-purpose computing.
|
||||
|
||||
* ROCm support for Taichi is hosted in the official `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/taichi-dev <https://github.com/taichi-dev>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker image <taichi-docker-compat>` with ROCm, PyTorch, and Taichi preinstalled.
|
||||
* See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>` to install and get started.
|
||||
|
||||
.. note::
|
||||
|
||||
Taichi is supported on ROCm 6.3.2.
|
||||
|
||||
Supported devices and features
|
||||
===============================================================================
|
||||
There is support through the ROCm software stack for all Taichi GPU features on AMD Instinct MI250X and MI210X series GPUs with the exception of Taichi’s GPU rendering system, CGUI.
|
||||
AMD Instinct MI300X series GPUs will be supported by November.
|
||||
|
||||
.. _taichi-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
To fully leverage Taichi's performance capabilities in compute-intensive tasks, it is best to adhere to specific coding patterns and utilize Taichi decorators.
|
||||
A collection of example use cases is available in the `https://github.com/ROCm/taichi_examples <https://github.com/ROCm/taichi_examples>`_ repository,
|
||||
providing practical insights and foundational knowledge for working with the Taichi programming language.
|
||||
You can also refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_ to search for Taichi examples and best practices to optimize your workflows on AMD GPUs.
|
||||
|
||||
.. _taichi-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm Taichi Docker images <https://hub.docker.com/r/rocm/taichi/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated inventories
|
||||
represent the latest Taichi version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.3.2 <https://rocm.docs.amd.com/en/docs-6.3.2/about/release-notes.html>`_.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Taichi
|
||||
- Ubuntu
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/taichi/taichi-1.8.0b1_rocm6.3.2_ubuntu22.04_py3.10.12/images/sha256-e016964a751e6a92199032d23e70fa3a564fff8555afe85cd718f8aa63f11fc6"><i class="fab fa-docker fa-lg"></i> rocm/taichi</a>
|
||||
- `6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`_
|
||||
- `1.8.0b1 <https://github.com/taichi-dev/taichi>`_
|
||||
- 22.04
|
||||
- `3.10.12 <https://www.python.org/downloads/release/python-31012/>`_
|
||||
@@ -107,6 +107,7 @@ article_pages = [
|
||||
{"file": "compatibility/ml-compatibility/stanford-megatron-lm-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/flashinfer-compatibility", "os": ["linux"]},
|
||||
|
||||
@@ -1,16 +1,47 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.9
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.7-jax060
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
JAX: 0.6.2
|
||||
Python: 3.10.18
|
||||
Transformer Engine: 2.2.0.dev0+c91bac54
|
||||
ROCm: 6.4.1
|
||||
JAX: 0.6.0
|
||||
Python: 3.10.12
|
||||
Transformer Engine: 2.1.0+90d703dd
|
||||
hipBLASLt: 1.1.0-499ece1c21
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.7
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
|
||||
components:
|
||||
ROCm: 6.4.1
|
||||
JAX: 0.5.0
|
||||
Python: 3.10.12
|
||||
Transformer Engine: 2.1.0+90d703dd
|
||||
hipBLASLt: 1.x.x
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3 8B
|
||||
mad_tag: jax_maxtext_train_llama-3-8b
|
||||
multinode_training_script: llama3_8b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3 70B
|
||||
mad_tag: jax_maxtext_train_llama-3-70b
|
||||
multinode_training_script: llama3_70b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 2 7B
|
||||
mad_tag: jax_maxtext_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
@@ -23,29 +54,6 @@ model_groups:
|
||||
precision: bf16
|
||||
multinode_training_script: llama2_70b_multinode.sh
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- model: Llama 3 8B (multi-node)
|
||||
mad_tag: jax_maxtext_train_llama-3-8b
|
||||
multinode_training_script: llama3_8b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3 70B (multi-node)
|
||||
mad_tag: jax_maxtext_train_llama-3-70b
|
||||
multinode_training_script: llama3_70b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
|
||||
@@ -1,21 +1,14 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/megatron-lm:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: aab4234
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
- pull_tag: rocm/megatron-lm:v25.8_py310
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-50fc824361054e445e86d5d88d5f58817f61f8ec83ad4a7e43ea38bbc4a142c0
|
||||
components:
|
||||
ROCm: 6.4.3
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/megatron-lm:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
hipBLASLt: d1b517fc7a
|
||||
Triton: 3.3.0
|
||||
RCCL: 2.22.3
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
@@ -26,6 +19,8 @@ model_groups:
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
|
||||
- model: Llama 3.1 70B (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-7b
|
||||
- model: Llama 2 70B
|
||||
|
||||
@@ -1,72 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.7-jax060
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
|
||||
components:
|
||||
ROCm: 6.4.1
|
||||
JAX: 0.6.0
|
||||
Python: 3.10.12
|
||||
Transformer Engine: 2.1.0+90d703dd
|
||||
hipBLASLt: 1.1.0-499ece1c21
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.7
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
|
||||
components:
|
||||
ROCm: 6.4.1
|
||||
JAX: 0.5.0
|
||||
Python: 3.10.12
|
||||
Transformer Engine: 2.1.0+90d703dd
|
||||
hipBLASLt: 1.x.x
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3 8B
|
||||
mad_tag: jax_maxtext_train_llama-3-8b
|
||||
multinode_training_script: llama3_8b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3 70B
|
||||
mad_tag: jax_maxtext_train_llama-3-70b
|
||||
multinode_training_script: llama3_70b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 2 7B
|
||||
mad_tag: jax_maxtext_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
precision: bf16
|
||||
multinode_training_script: llama2_7b_multinode.sh
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- model: Llama 2 70B
|
||||
mad_tag: jax_maxtext_train_llama-2-70b
|
||||
model_repo: Llama-2-70B
|
||||
precision: bf16
|
||||
multinode_training_script: llama2_70b_multinode.sh
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V2-Lite (16B)
|
||||
mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
|
||||
model_repo: DeepSeek-V2-lite
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: jax_maxtext_train_mixtral-8x7b
|
||||
model_repo: Mixtral-8x7B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
@@ -1,48 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/megatron-lm:v25.8_py310
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-50fc824361054e445e86d5d88d5f58817f61f8ec83ad4a7e43ea38bbc4a142c0
|
||||
components:
|
||||
ROCm: 6.4.3
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
hipBLASLt: d1b517fc7a
|
||||
Triton: 3.3.0
|
||||
RCCL: 2.22.3
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
|
||||
- model: Llama 3.1 70B (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-7b
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-70b
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-7b
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-72b
|
||||
@@ -1,58 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/megatron-lm:v25.8_py310
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-50fc824361054e445e86d5d88d5f58817f61f8ec83ad4a7e43ea38bbc4a142c0
|
||||
components:
|
||||
ROCm: 6.4.3
|
||||
Primus: 927a717
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
hipBLASLt: d1b517fc7a
|
||||
Triton: 3.3.0
|
||||
RCCL: 2.22.3
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||
config_name: llama3.3_70B-pretrain.yaml
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||
config_name: llama3.1_70B-pretrain.yaml
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||
config_name: llama3.1_8B-pretrain.yaml
|
||||
- model: Llama 2 7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
|
||||
config_name: llama2_7B-pretrain.yaml
|
||||
- model: Llama 2 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
|
||||
config_name: llama2_70B-pretrain.yaml
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
config_name: deepseek_v3-pretrain.yaml
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
config_name: deepseek_v2_lite-pretrain.yaml
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||
config_name: mixtral_8x7B_v0.1-pretrain.yaml
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
config_name: mixtral_8x22B_v0.1-pretrain.yaml
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
|
||||
config_name: primus_qwen2.5_7B-pretrain.yaml
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||
config_name: qwen2.5_72B-pretrain.yaml
|
||||
@@ -1,24 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/pytorch-training:v25.8
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
|
||||
components:
|
||||
ROCm: 6.4.3
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: 3.10.18
|
||||
Transformer Engine: 2.2.0.dev0+a1e66aae
|
||||
Flash Attention: 3.0.0.post1
|
||||
hipBLASLt: 1.1.0-d1b517fc7a
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B
|
||||
precision: BF16
|
||||
@@ -1,178 +0,0 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/pytorch-training:v25.8
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
|
||||
components:
|
||||
ROCm: 6.4.3
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: 3.10.18
|
||||
Transformer Engine: 2.2.0.dev0+a1e66aae
|
||||
Flash Attention: 3.0.0.post1
|
||||
hipBLASLt: 1.1.0-d1b517fc7a
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 4 Scout 17B-16E
|
||||
mad_tag: pyt_train_llama-4-scout-17b-16e
|
||||
model_repo: Llama-4-17B_16E
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
|
||||
- model: Llama 3.2 1B
|
||||
mad_tag: pyt_train_llama-3.2-1b
|
||||
model_repo: Llama-3.2-1B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-1B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.2 3B
|
||||
mad_tag: pyt_train_llama-3.2-3b
|
||||
model_repo: Llama-3.2-3B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-3B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.2 Vision 11B
|
||||
mad_tag: pyt_train_llama-3.2-vision-11b
|
||||
model_repo: Llama-3.2-Vision-11B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw]
|
||||
- model: Llama 3.2 Vision 90B
|
||||
mad_tag: pyt_train_llama-3.2-vision-90b
|
||||
model_repo: Llama-3.2-Vision-90B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: BF16
|
||||
training_modes: [pretrain, finetune_fw, finetune_lora]
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_train_llama-3.1-405b
|
||||
model_repo: Llama-3.1-405B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B
|
||||
precision: BF16
|
||||
training_modes: [finetune_qlora]
|
||||
- model: Llama 3 8B
|
||||
mad_tag: pyt_train_llama-3-8b
|
||||
model_repo: Llama-3-8B
|
||||
url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3 70B
|
||||
mad_tag: pyt_train_llama-3-70b
|
||||
model_repo: Llama-3-70B
|
||||
url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
|
||||
- model: Llama 2 13B
|
||||
mad_tag: pyt_train_llama-2-13b
|
||||
model_repo: Llama-2-13B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_train_llama-2-70b
|
||||
model_repo: Llama-2-70B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora, finetune_qlora]
|
||||
- group: OpenAI
|
||||
tag: openai
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_train_gpt_oss_20b
|
||||
model_repo: GPT-OSS-20B
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_train_gpt_oss_120b
|
||||
model_repo: GPT-OSS-120B
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 3 8B
|
||||
mad_tag: pyt_train_qwen3-8b
|
||||
model_repo: Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Qwen 3 32B
|
||||
mad_tag: pyt_train_qwen3-32b
|
||||
model_repo: Qwen3-32
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2.5 32B
|
||||
mad_tag: pyt_train_qwen2.5-32b
|
||||
model_repo: Qwen2.5-32B
|
||||
url: https://huggingface.co/Qwen/Qwen2.5-32B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_train_qwen2.5-72b
|
||||
model_repo: Qwen2.5-72B
|
||||
url: https://huggingface.co/Qwen/Qwen2.5-72B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2 1.5B
|
||||
mad_tag: pyt_train_qwen2-1.5b
|
||||
model_repo: Qwen2-1.5B
|
||||
url: https://huggingface.co/Qwen/Qwen2-1.5B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Qwen 2 7B
|
||||
mad_tag: pyt_train_qwen2-7b
|
||||
model_repo: Qwen2-7B
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- group: Stable Diffusion
|
||||
tag: sd
|
||||
models:
|
||||
- model: Stable Diffusion XL
|
||||
mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
|
||||
model_repo: SDXL
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- group: Flux
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1-dev
|
||||
mad_tag: pyt_train_flux
|
||||
model_repo: Flux
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
precision: BF16
|
||||
training_modes: [pretrain]
|
||||
- group: NCF
|
||||
tag: ncf
|
||||
models:
|
||||
- model: NCF
|
||||
mad_tag: pyt_ncf_training
|
||||
model_repo:
|
||||
url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
|
||||
precision: FP32
|
||||
@@ -1,22 +1,15 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/primus:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
- pull_tag: rocm/megatron-lm:v25.8_py310
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-50fc824361054e445e86d5d88d5f58817f61f8ec83ad4a7e43ea38bbc4a142c0
|
||||
components:
|
||||
ROCm: 6.4.3
|
||||
Primus: 927a717
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/primus:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
hipBLASLt: d1b517fc7a
|
||||
Triton: 3.3.0
|
||||
RCCL: 2.22.3
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
|
||||
@@ -1,39 +1,24 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/primus:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/primus:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
- pull_tag: rocm/pytorch-training:v25.8
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
|
||||
components:
|
||||
ROCm: 6.4.3
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: 3.10.18
|
||||
Transformer Engine: 2.2.0.dev0+a1e66aae
|
||||
Flash Attention: 3.0.0.post1
|
||||
hipBLASLt: 1.1.0-d1b517fc7a
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_train_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
config_file:
|
||||
bf16: "./llama3_8b_fsdp_bf16.toml"
|
||||
fp8: "./llama3_8b_fsdp_fp8.toml"
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_train_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B
|
||||
precision: BF16
|
||||
config_file:
|
||||
bf16: "./llama3_70b_fsdp_bf16.toml"
|
||||
fp8: "./llama3_70b_fsdp_fp8.toml"
|
||||
|
||||
@@ -1,21 +1,13 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/pytorch-training:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: aab4234
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/pytorch-training:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
- pull_tag: rocm/pytorch-training:v25.8
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
|
||||
components:
|
||||
ROCm: 6.4.3
|
||||
PyTorch: 2.8.0a0+gitd06a406
|
||||
Python: 3.10.18
|
||||
Transformer Engine: 2.2.0.dev0+a1e66aae
|
||||
Flash Attention: 3.0.0.post1
|
||||
hipBLASLt: 1.1.0-d1b517fc7a
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
@@ -166,7 +158,7 @@ model_groups:
|
||||
model_repo: SDXL
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
|
||||
precision: BF16
|
||||
training_modes: [posttrain-p]
|
||||
training_modes: [finetune_lora]
|
||||
- group: Flux
|
||||
tag: flux
|
||||
models:
|
||||
@@ -175,7 +167,7 @@ model_groups:
|
||||
model_repo: Flux
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
precision: BF16
|
||||
training_modes: [posttrain-p]
|
||||
training_modes: [pretrain]
|
||||
- group: NCF
|
||||
tag: ncf
|
||||
models:
|
||||
|
||||
@@ -32,7 +32,7 @@ library_groups:
|
||||
|
||||
- name: "MIGraphX"
|
||||
tag: "migraphx"
|
||||
doc_link: "amdmigraphx:reference/MIGraphX-cpp"
|
||||
doc_link: "amdmigraphx:reference/cpp"
|
||||
data_types:
|
||||
- type: "int8"
|
||||
support: "⚠️"
|
||||
@@ -290,7 +290,7 @@ library_groups:
|
||||
|
||||
- name: "Tensile"
|
||||
tag: "tensile"
|
||||
doc_link: "tensile:src/reference/precision-support"
|
||||
doc_link: "tensile:reference/precision-support"
|
||||
data_types:
|
||||
- type: "int8"
|
||||
support: "✅"
|
||||
|
||||
@@ -98,6 +98,18 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
||||
|
||||
<a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
|
||||
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__
|
||||
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Ray <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/ray-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -15,9 +15,10 @@ using PyTorch. It delves into specific workloads such as
|
||||
:ref:`model inference <mi300x-vllm-optimization>`, offering strategies to
|
||||
enhance efficiency.
|
||||
|
||||
The following topics highlight :ref:`auto-tunable configurations <mi300x-auto-tune>` as
|
||||
well as :ref:`Triton kernel optimization <mi300x-triton-kernel-performance-optimization>`
|
||||
for meticulous tuning.
|
||||
The following topics highlight :ref:`auto-tunable configurations <mi300x-auto-tune>`
|
||||
that streamline optimization as well as advanced techniques like
|
||||
:ref:`Triton kernel optimization <mi300x-triton-kernel-performance-optimization>` for
|
||||
meticulous tuning.
|
||||
|
||||
Workload tuning strategy
|
||||
========================
|
||||
@@ -85,28 +86,27 @@ Optimize model inference with vLLM
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
vLLM provides tools and techniques specifically designed for efficient model
|
||||
inference on AMD Instinct GPUs. See the official `vLLM installation docs
|
||||
<https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html>`__ for
|
||||
installation guidance. Optimizing performance with vLLM involves configuring
|
||||
tensor parallelism, leveraging advanced features, and ensuring efficient
|
||||
execution.
|
||||
inference on AMD Instinct MI300X accelerators. See :ref:`fine-tuning-llms-vllm`
|
||||
for installation guidance. Optimizing performance with vLLM
|
||||
involves configuring tensor parallelism, leveraging advanced features, and
|
||||
ensuring efficient execution. Here’s how to optimize vLLM performance:
|
||||
|
||||
* Configuration for vLLM: Set engine arguments according to workload
|
||||
requirements.
|
||||
* Tensor parallelism: Configure the
|
||||
:ref:`tensor-parallel-size parameter <mi300x-vllm-multiple-gpus>` to distribute
|
||||
tensor computations across multiple GPUs. Adjust parameters such as
|
||||
``batch-size``, ``input-len``, and ``output-len`` based on your workload.
|
||||
|
||||
* Configuration for vLLM: Set :ref:`parameters <mi300x-vllm-optimization>`
|
||||
according to workload requirements. Benchmark performance to understand
|
||||
characteristics and identify bottlenecks.
|
||||
|
||||
* Benchmarking and performance metrics: Measure latency and throughput to
|
||||
evaluate performance.
|
||||
|
||||
.. seealso::
|
||||
|
||||
See :doc:`vllm-optimization` to learn more about vLLM performance
|
||||
optimization techniques.
|
||||
|
||||
.. _mi300x-auto-tune:
|
||||
|
||||
Auto-tunable configurations
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Auto-tunable configurations can significantly streamline performance
|
||||
optimization by automatically adjusting parameters based on workload
|
||||
characteristics. For example:
|
||||
@@ -120,7 +120,8 @@ characteristics. For example:
|
||||
your specific hardware.
|
||||
|
||||
* Triton: Use :ref:`Triton’s auto-tuning features <mi300x-autotunable-kernel-config>`
|
||||
to explore various kernel configurations and select the best-performing ones.
|
||||
to explore various kernel configurations and automatically select the
|
||||
best-performing ones.
|
||||
|
||||
Manual tuning
|
||||
^^^^^^^^^^^^^
|
||||
@@ -327,21 +328,380 @@ hardware counters are also included.
|
||||
|
||||
ROCm Systems Profiler timeline trace example.
|
||||
|
||||
.. _mi300x-vllm-optimization:
|
||||
|
||||
vLLM performance optimization
|
||||
=============================
|
||||
|
||||
vLLM is a high-throughput and memory efficient inference and serving engine for
|
||||
large language models that has gained traction in the AI community for its
|
||||
performance and ease of use. See :doc:`vllm-optimization`, where you'll learn
|
||||
how to:
|
||||
vLLM is a high-throughput and memory efficient inference and serving engine for large language models that has gained traction in the AI community for
|
||||
its performance and ease of use. See :ref:`fine-tuning-llms-vllm` for a primer on vLLM with ROCm.
|
||||
|
||||
Performance environment variables
|
||||
---------------------------------
|
||||
|
||||
The following performance tips are not *specific* to vLLM -- they are general
|
||||
but relevant in this context. You can tune the following vLLM parameters to
|
||||
achieve optimal request latency and throughput performance.
|
||||
|
||||
* As described in `Environment variables (MI300X)
|
||||
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#environment-variables>`_,
|
||||
the environment variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM
|
||||
performance. Set it to ``export HIP_FORCE_DEV_KERNARG=1``.
|
||||
|
||||
* Set the :ref:`RCCL environment variable <mi300x-rccl>` ``NCCL_MIN_NCHANNELS``
|
||||
to ``112`` to increase the number of channels on MI300X to potentially improve
|
||||
performance.
|
||||
|
||||
* Set the environment variable ``TORCH_BLAS_PREFER_HIPBLASLT=1`` to use hipBLASLt to improve performance.
|
||||
|
||||
Auto-tuning using PyTorch TunableOp
|
||||
------------------------------------
|
||||
|
||||
Since vLLM is based on the PyTorch framework, PyTorch TunableOp can be used for auto-tuning.
|
||||
You can run auto-tuning with TunableOp in two simple steps without modifying your code:
|
||||
|
||||
* Enable TunableOp and tuning. Optionally, enable verbose mode:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_VERBOSE=1 your_vllm_script.sh
|
||||
|
||||
* Enable TunableOp and disable tuning and measure.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 your_vllm_script.sh
|
||||
|
||||
Learn more about TunableOp in the :ref:`PyTorch TunableOp <mi300x-tunableop>` section.
|
||||
|
||||
Performance tuning based on vLLM engine configurations
|
||||
-------------------------------------------------------
|
||||
|
||||
The following subsections describe vLLM-specific configurations for performance tuning.
|
||||
You can tune the following vLLM parameters to achieve optimal performance.
|
||||
|
||||
* ``tensor_parallel_size``
|
||||
|
||||
* ``gpu_memory_utilization``
|
||||
|
||||
* ``dtype``
|
||||
|
||||
* ``enforce_eager``
|
||||
|
||||
* ``kv_cache_dtype``
|
||||
|
||||
* ``input_len``
|
||||
|
||||
* ``output_len``
|
||||
|
||||
* ``max_num_seqs``
|
||||
|
||||
* ``num_scheduler_steps``
|
||||
|
||||
* ``max_model_len``
|
||||
|
||||
* ``enable_chunked_prefill``
|
||||
|
||||
* ``distributed_executor_backend``
|
||||
|
||||
* ``max_seq_len_to_capture``
|
||||
|
||||
Refer to `vLLM documentation <https://docs.vllm.ai/en/latest/models/performance.html>`_
|
||||
for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
|
||||
usage with ROCm.
|
||||
|
||||
ROCm provides a prebuilt optimized Docker image for validating the performance
|
||||
of LLM inference with vLLM on MI300X series accelerators. The Docker image includes
|
||||
ROCm, vLLM, and PyTorch. For more information, see
|
||||
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
|
||||
|
||||
.. _mi300x-vllm-throughput-measurement:
|
||||
|
||||
Evaluating performance by throughput measurement
|
||||
-------------------------------------------------
|
||||
|
||||
This tuning guide evaluates the performance of LLM inference workloads by measuring throughput in tokens per second (TPS). Throughput can be assessed using both real-world and synthetic data, depending on your evaluation goals.
|
||||
|
||||
Refer to the benchmarking script located at ``benchmarks/benchmark_throughput.py`` in the `vLLM repository <https://github.com/ROCm/vllm/blob/main/benchmarks/benchmark_throughput.py>`_.
|
||||
Use this script to measure throughput effectively. You can assess throughput using real-world and synthetic data, depending on your evaluation goals.
|
||||
|
||||
* For realistic performance evaluation, you can use datasets like Hugging Face's
|
||||
``ShareGPT_V3_unfiltered_cleaned_split.json``. This dataset includes real-world conversational
|
||||
data, making it a good representation of typical use cases for language models. Download it using
|
||||
the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
* For standardized benchmarking, you can set fixed input and output token
|
||||
lengths. Synthetic prompts provide consistent benchmarking runs, making it
|
||||
easier to compare performance across different models or configurations.
|
||||
Additionally, a controlled environment simplifies analysis.
|
||||
|
||||
By balancing real-world data and synthetic data approaches, you can get a well-rounded understanding of model performance in varied scenarios.
|
||||
|
||||
.. _mi300x-vllm-single-node:
|
||||
|
||||
Maximizing vLLM instances on a single node
|
||||
------------------------------------------
|
||||
|
||||
The general guideline is to maximize per-node throughput by running as many vLLM instances as possible.
|
||||
However, running too many instances might lead to insufficient memory for the KV-cache, which can affect performance.
|
||||
|
||||
The Instinct MI300X accelerator is equipped with 192GB of HBM3 memory capacity and bandwidth.
|
||||
For models that fit in one GPU -- to maximize the accumulated throughput -- you can run as many as eight vLLM instances
|
||||
simultaneously on one MI300X node (with eight GPUs). To do so, use the GPU isolation environment
|
||||
variable ``CUDA_VISIBLE_DEVICES``.
|
||||
|
||||
For example, this script runs eight instances of vLLM for throughput benchmarking at the same time
|
||||
with a model that can fit in one GPU:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
for i in $(seq 0 7);
|
||||
do
|
||||
CUDA_VISIBLE_DEVICES="$i" python3 /app/vllm/benchmarks/benchmark_throughput.py -tp 1 --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
|
||||
done
|
||||
|
||||
The total throughput achieved by running ``N`` instances of vLLM is generally much higher than running a
|
||||
single vLLM instance across ``N`` GPUs simultaneously (that is, configuring ``tensor_parallel_size`` as N or
|
||||
using the ``-tp`` N option, where ``1 < N ≤ 8``).
|
||||
|
||||
vLLM on MI300X accelerators can run a variety of model weights, including Llama 2 (7b, 13b, 70b), Llama 3 (8b, 70b), Qwen2 (7b, 72b), Mixtral-8x7b, Mixtral-8x22b, and so on.
|
||||
Notable configurations include Llama2-70b and Llama3-70b models on a single MI300X GPU, and the Llama3.1 405b model can fit on one single node with 8 MI300X GPUs.
|
||||
|
||||
.. _mi300x-vllm-gpu-memory-utilization:
|
||||
|
||||
Configure the gpu_memory_utilization parameter
|
||||
----------------------------------------------
|
||||
|
||||
There are two ways to increase throughput by configuring ``gpu-memory-utilization`` parameter.
|
||||
|
||||
1. Increase ``gpu-memory-utilization`` to improve the throughput for a single instance as long as
|
||||
it does not incur HIP or CUDA Out Of Memory. The default ``gpu-memory-utilization`` is 0.9.
|
||||
You can set it to ``>0.9`` and ``<1``.
|
||||
|
||||
For example, the benchmarking command below sets the ``gpu-memory-utilization`` to 0.98, or 98%.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
/vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.98 --input-len 1024 --output-len 128 --model /path/to/model
|
||||
|
||||
2. Decrease ``gpu-memory-utilization`` to maximize the number of vLLM instances on the same GPU.
|
||||
|
||||
Specify GPU memory utilization to run as many instances of vLLM as possible on a single
|
||||
GPU. However, too many instances can result in no memory for KV-cache. For small models, run
|
||||
multiple instances of vLLM on the same GPU by specifying a smaller ``gpu-memory-utilization`` -- as
|
||||
long as it would not cause HIP Out Of Memory.
|
||||
|
||||
For example, run two instances of the Llama3-8b model at the same time on a single GPU by specifying
|
||||
``--gpu-memory-utilization`` to 0.4 (40%) as follows (on GPU ``0``):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4
|
||||
--dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4
|
||||
--dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
|
||||
|
||||
See :ref:`vllm-engine-args` for other performance suggestions.
|
||||
|
||||
.. _mi300x-vllm-multiple-gpus:
|
||||
|
||||
Run vLLM on multiple GPUs
|
||||
-------------------------
|
||||
|
||||
The two main reasons to use multiple GPUs are:
|
||||
|
||||
* The model size is too big to run vLLM using one GPU as it results in HIP Out of Memory.
|
||||
|
||||
* To achieve better latency when using a single GPU is not desirable.
|
||||
|
||||
To run one vLLM instance on multiple GPUs, use the ``-tp`` or ``--tensor-parallel-size`` option to
|
||||
specify multiple GPUs. Optionally, use the ``CUDA_VISIBLE_DEVICES`` environment variable to specify
|
||||
the GPUs.
|
||||
|
||||
For example, you can use two GPUs to start an API server on port 8000:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python -m vllm.entrypoints.api_server --model /path/to/model --dtype
|
||||
float16 -tp 2 --port 8000 &
|
||||
|
||||
To achieve both latency and throughput performance for serving, you can run multiple API servers on
|
||||
different GPUs by specifying different ports for each server and use ``CUDA_VISIBLE_DEVICES`` to
|
||||
specify the GPUs for each server, for example:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.api_server --model
|
||||
/path/to/model --dtype float16 -tp 2 --port 8000 &
|
||||
|
||||
CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.api_server --model
|
||||
/path/to/model --dtype float16 -tp 2 --port 8001 &
|
||||
|
||||
Choose an attention backend
|
||||
---------------------------
|
||||
|
||||
vLLM on ROCm supports two attention backends, each suitable for different use cases and performance
|
||||
requirements:
|
||||
|
||||
- **Triton Flash Attention** - For benchmarking, run vLLM scripts at
|
||||
least once as a warm-up step so Triton can perform auto-tuning before
|
||||
collecting benchmarking numbers. This is the default setting.
|
||||
|
||||
- **Composable Kernel (CK) Flash Attention** - To use CK Flash Attention, specify
|
||||
the environment variable as ``export VLLM_USE_TRITON_FLASH_ATTN=0``.
|
||||
|
||||
|
||||
Refer to :ref:`Model acceleration libraries <acceleration-flash-attention>`
|
||||
to learn more about Flash Attention with Triton or CK backends.
|
||||
|
||||
.. _vllm-engine-args:
|
||||
|
||||
vLLM engine arguments
|
||||
---------------------
|
||||
|
||||
The following are configuration suggestions to potentially improve performance with vLLM. See
|
||||
`vLLM's engine arguments documentation <https://docs.vllm.ai/en/latest/serving/engine_args.html>`_
|
||||
for a full list of configurable engine arguments.
|
||||
|
||||
Configure the max-num-seqs parameter
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Increase the ``max-num-seqs`` parameter from the default ``256`` to ``512`` (``--max-num-seqs
|
||||
512``). This increases the maximum number of sequences per iteration and can improve throughput.
|
||||
|
||||
Use the float16 dtype
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The default data type (``dtype``) is specified in the model’s configuration file. For instance, some models use ``torch.bfloat16`` as their default ``dtype``.
|
||||
Use float16 (``--dtype float16``) for better performance.
|
||||
|
||||
Multi-step scheduling
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Setting ``num-scheduler-steps`` for multi-step scheduling can increase performance. Set it between 10 to 15 (``--num-scheduler-steps 10``).
|
||||
|
||||
Distributed executor backend
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The vLLM supports two modes of distributed executor backend: ``ray`` and ``mp``. When using the `<https://github.com/ROCm/vllm>`__ fork, using the ``mp``
|
||||
backend (``--distributed_executor_backend mp``) is recommended.
|
||||
|
||||
Graph mode max-seq-len-to-capture
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Maximum sequence length covered by CUDA graphs. In the default mode (where ``enforce_eager`` is ``False``), when a sequence has context length
|
||||
larger than this, vLLM engine falls back to eager mode. The default is 8192.
|
||||
|
||||
When working with models that support long context lengths, set the parameter ``--max-seq-len-to-capture`` to 16384.
|
||||
See this `vLLM blog <https://blog.vllm.ai/2024/10/23/vllm-serving-amd.html>`__ for details.
|
||||
|
||||
An example of long context length model is Qwen2-7b.
|
||||
|
||||
Whether to enable chunked prefill
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Another vLLM performance tip is to enable chunked prefill to improve
|
||||
throughput. Chunked prefill allows large prefills to be chunked into
|
||||
smaller chunks and batched together with decode requests.
|
||||
|
||||
You can enable the feature by specifying ``--enable-chunked-prefill`` in the
|
||||
command line or setting ``enable_chunked_prefill=True`` in the LLM
|
||||
constructor.
|
||||
|
||||
As stated in `vLLM's documentation, <https://docs.vllm.ai/en/latest/models/performance.html#chunked-prefill>`__,
|
||||
you can tune the performance by changing ``max_num_batched_tokens``. By
|
||||
default, it is set to 512 and optimized for ITL (inter-token latency).
|
||||
Smaller ``max_num_batched_tokens`` achieves better ITL because there are
|
||||
fewer prefills interrupting decodes.
|
||||
Higher ``max_num_batched_tokens`` achieves better TTFT (time to the first
|
||||
token) as you can put more prefill to the batch.
|
||||
|
||||
You might experience noticeable throughput improvements when
|
||||
benchmarking on a single GPU or 8 GPUs using the vLLM throughput
|
||||
benchmarking script along with the ShareGPT dataset as input.
|
||||
|
||||
In the case of fixed ``input-len``/``output-len``, for some configurations,
|
||||
enabling chunked prefill increases the throughput. For some other
|
||||
configurations, the throughput may be worse and elicit a need to tune
|
||||
parameter ``max_num_batched_tokens`` (for example, increasing ``max_num_batched_tokens`` value to 4096 or larger).
|
||||
|
||||
.. note::
|
||||
|
||||
Chunked prefill is no longer recommended. See the vLLM blog: `Serving LLMs on AMD MI300X: Best Practices <https://blog.vllm.ai/2024/10/23/vllm-serving-amd.html>`_ (October 2024).
|
||||
|
||||
Quantization support
|
||||
---------------------
|
||||
|
||||
Quantization reduces the precision of the model’s weights and activations, which significantly decreases the memory footprint.
|
||||
``fp8(w8a8)`` and ``AWQ`` quantization are supported for ROCm.
|
||||
|
||||
FP8 quantization
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
`<https://github.com/ROCm/vllm>`__ supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on the Instinct MI300X.
|
||||
Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy.
|
||||
|
||||
AMD publishes Quark Quantized OCP FP8 models on Hugging Face. For example:
|
||||
|
||||
* `Llama-3.1-8B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV>`__
|
||||
* `Llama-3.1-70B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`__
|
||||
* `Llama-3.1-405B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`__
|
||||
* `Mixtral-8x7B-Instruct-v0.1-FP8-KV <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`__
|
||||
* `Mixtral-8x22B-Instruct-v0.1-FP8-KV <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`__
|
||||
|
||||
To enable vLLM benchmarking to run on fp8 quantized models, use the ``--quantization`` parameter with value ``fp8`` (``--quantization fp8``).
|
||||
|
||||
AWQ quantization
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
You can quantize your own models by installing AutoAWQ or picking one of the 400+ models on Hugging Face. Be aware that
|
||||
AWQ support in vLLM is currently underoptimized.
|
||||
|
||||
To enable vLLM to run on ``awq`` quantized models, use the ``--quantization`` parameter with ``awq`` (``--quantization awq``).
|
||||
|
||||
You can find more specifics in the `vLLM AutoAWQ documentation <https://docs.vllm.ai/en/stable/quantization/auto_awq.html>`_.
|
||||
|
||||
fp8 kv-cached-dtype
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Using ``fp8 kv-cache dtype`` can improve performance as it reduces the size
|
||||
of ``kv-cache``. As a result, it reduces the cost required for reading and
|
||||
writing the ``kv-cache``.
|
||||
|
||||
To use this feature, specify ``--kv-cache-dtype`` as ``fp8``.
|
||||
|
||||
To specify the quantization scaling config, use the
|
||||
``--quantization-param-path`` parameter. If the parameter is not specified,
|
||||
the default scaling factor of ``1`` is used, which can lead to less accurate
|
||||
results. To generate ``kv-cache`` scaling JSON file, see `FP8 KV
|
||||
Cache <https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_kv_cache/README.md>`__
|
||||
in the vLLM GitHub repository.
|
||||
|
||||
Two sample Llama scaling configuration files are in vLLM for ``llama2-70b`` and
|
||||
``llama2-7b``.
|
||||
|
||||
If building the vLLM using
|
||||
`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm>`_
|
||||
for ``llama2-70b`` scale config, find the file at
|
||||
``/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json`` at
|
||||
runtime.
|
||||
|
||||
Below is a sample command to run benchmarking with this feature enabled
|
||||
for the ``llama2-70b`` model:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python3 /vllm-workspace/benchmarks/benchmark_throughput.py --model \
|
||||
/path/to/llama2-70b-model --kv-cache-dtype "fp8" \
|
||||
--quantization-param-path \
|
||||
"/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json" \
|
||||
--input-len 512 --output-len 256 --num-prompts 500
|
||||
|
||||
* Enable AITER (AI Tensor Engine for ROCm) to speed up on LLM models.
|
||||
* Configure environment variables for optimal HIP, RCCL, and Quick Reduce performance.
|
||||
* Select the right attention backend for your workload (AITER MHA/MLA vs. Triton).
|
||||
* Choose parallelism strategies (tensor, pipeline, data, expert) for multi-GPU deployments.
|
||||
* Apply quantization (``FP8``/``FP4``) to reduce memory usage by 2-4× with minimal accuracy loss.
|
||||
* Tune engine arguments (batch size, memory utilization, graph modes) for your use case.
|
||||
* Benchmark and scale across single-node and multi-node configurations.
|
||||
|
||||
.. _mi300x-tunableop:
|
||||
|
||||
@@ -586,33 +946,33 @@ for details.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
HIP_FORCE_DEV_KERNARG=1 hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
|
||||
HIP_FORCE_DEV_KERNARG=1 hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
|
||||
--a_type f16_r --b_type f8_r --compute_type f32_f16_r \
|
||||
--initialization trig_float --cold_iters 100 --iters 1000 --rotating 256
|
||||
--initialization trig_float --cold_iters 100 --iters 1000 --rotating 256
|
||||
|
||||
* Example 2: Benchmark forward epilogues and backward epilogues
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";``
|
||||
* ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";``
|
||||
* ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";``
|
||||
* ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_DGELU": --activation_type gelu --gradient";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";``
|
||||
* ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";``
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADB: "--bias_vector --gradient --bias_source b";``
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADB: "--bias_vector --gradient --bias_source b";``
|
||||
|
||||
|
||||
hipBLASLt auto-tuning using hipblaslt-bench
|
||||
@@ -671,26 +1031,26 @@ The tuning tool is a two-step tool. It first runs the benchmark, then it creates
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
defaultBenchOptions = {"ProblemType": {
|
||||
"TransposeA": 0,
|
||||
"TransposeB": 0,
|
||||
"ComputeInputDataType": "s",
|
||||
"ComputeDataType": "s",
|
||||
"DataTypeC": "s",
|
||||
"DataTypeD": "s",
|
||||
"UseBias": False
|
||||
}, "TestConfig": {
|
||||
"ColdIter": 20,
|
||||
"Iter": 100,
|
||||
"AlgoMethod": "all",
|
||||
"RequestedSolutions": 2, # Only works in AlgoMethod heuristic
|
||||
"SolutionIndex": None, # Only works in AlgoMethod index
|
||||
"ApiMethod": "cpp",
|
||||
"RotatingBuffer": 0,
|
||||
}, "TuningParameters": {
|
||||
"SplitK": [0]
|
||||
}, "ProblemSizes": []}
|
||||
defaultCreateLogicOptions = {} # Currently unused
|
||||
defaultBenchOptions = {"ProblemType": {
|
||||
"TransposeA": 0,
|
||||
"TransposeB": 0,
|
||||
"ComputeInputDataType": "s",
|
||||
"ComputeDataType": "s",
|
||||
"DataTypeC": "s",
|
||||
"DataTypeD": "s",
|
||||
"UseBias": False
|
||||
}, "TestConfig": {
|
||||
"ColdIter": 20,
|
||||
"Iter": 100,
|
||||
"AlgoMethod": "all",
|
||||
"RequestedSolutions": 2, # Only works in AlgoMethod heuristic
|
||||
"SolutionIndex": None, # Only works in AlgoMethod index
|
||||
"ApiMethod": "cpp",
|
||||
"RotatingBuffer": 0,
|
||||
}, "TuningParameters": {
|
||||
"SplitK": [0]
|
||||
}, "ProblemSizes": []}
|
||||
defaultCreateLogicOptions = {} # Currently unused
|
||||
|
||||
* ``TestConfig``
|
||||
1. ``ColdIter``: This is the number of warm-up iterations before starting the kernel benchmark.
|
||||
@@ -870,7 +1230,7 @@ command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
merge.py original_dir new_tuned_yaml_dir output_dir
|
||||
merge.py original_dir new_tuned_yaml_dir output_dir
|
||||
|
||||
The following table describes the logic YAML files.
|
||||
|
||||
@@ -1473,7 +1833,7 @@ de-quantize the ``int4`` key-value from the ``int4`` data type to ``fp16``.
|
||||
|
||||
From the IR snippet, you can see ``i32`` data is loaded from global memory to
|
||||
registers (``%190``). With a few element-wise operations in registers, it is
|
||||
stored in shared memory (``%269``) for the transpose operation (``%270``), which
|
||||
stored in shared memory (``%269``) for the transpose operation (``%270``), which
|
||||
needs data movement across different threads. With the transpose done, it is
|
||||
loaded from LDS to register again (``%276``), and with a few more
|
||||
element-wise operations, it is stored to LDS again (``%298``). The last step
|
||||
@@ -1607,7 +1967,7 @@ something similar to the following:
|
||||
loaded at: [0x7fd4f100c000-0x7fd4f100e070]
|
||||
|
||||
The kernel name and the code object file should be listed. In the
|
||||
example above, the kernel name is vector_add_assert_trap, but this might
|
||||
example above, the kernel name is vector_add_assert_trap, but this might
|
||||
also look like:
|
||||
|
||||
.. code-block:: text
|
||||
@@ -1721,8 +2081,3 @@ Hardware efficiency is maximized with 4 or fewer HIP streams. These environment
|
||||
configuration to two compute streams and two RCCL streams, aligning with this best practice.
|
||||
Additionally, RCCL is often pre-optimized for MI300 systems in production by querying the node
|
||||
topology during startup, reducing the need for extensive manual tuning.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
* :doc:`vllm-optimization`
|
||||
|
||||
@@ -277,7 +277,7 @@ PyTorch training
|
||||
|
||||
.. seealso::
|
||||
|
||||
See :ref:`Training a model with PyTorch <amd-pytorch-training-multinode-examples>` for more examples and information.
|
||||
See :ref:`Training a model with PyTorch <amd-pytorch-multinode-examples>` for more examples and information.
|
||||
|
||||
Megatron-LM
|
||||
-----------
|
||||
|
||||
@@ -92,7 +92,7 @@ GPUs, which can impact end-to-end latency.
|
||||
.. _healthcheck-install-transferbench:
|
||||
|
||||
1. To get started, use the instructions in the `TransferBench documentation
|
||||
<https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`__
|
||||
<https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`_
|
||||
or use the following commands:
|
||||
|
||||
.. code:: shell
|
||||
@@ -102,5 +102,5 @@ GPUs, which can impact end-to-end latency.
|
||||
CC=hipcc make
|
||||
|
||||
2. Run the suggested TransferBench tests -- see `TransferBench benchmarking
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html#transferbench>`__
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#transferbench-benchmarking-results>`_
|
||||
in the Instinct performance benchmarking documentation for instructions.
|
||||
|
||||
@@ -6,8 +6,14 @@
|
||||
Training a model with JAX MaxText on ROCm
|
||||
******************************************
|
||||
|
||||
MaxText is a high-performance, open-source framework built on the Google JAX
|
||||
machine learning library to train LLMs at scale. The MaxText framework for
|
||||
ROCm is an optimized fork of the upstream
|
||||
`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
|
||||
on AMD MI300X series GPUs.
|
||||
|
||||
The MaxText for ROCm training Docker image
|
||||
provides a prebuilt environment for training on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs,
|
||||
provides a prebuilt environment for training on AMD Instinct MI300X and MI325X GPUs,
|
||||
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
|
||||
It includes the following software components:
|
||||
|
||||
@@ -55,15 +61,15 @@ MaxText with on ROCm provides the following key features to train large language
|
||||
|
||||
- Multi-node support
|
||||
|
||||
- NANOO FP8 (for MI300X series GPUs) and FP8 (for MI355X and MI350X) quantization support
|
||||
- NANOO FP8 quantization support
|
||||
|
||||
.. _amd-maxtext-model-support-v259:
|
||||
.. _amd-maxtext-model-support-v257:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on AMD Instinct
|
||||
GPUs. Some instructions, commands, and available training
|
||||
The following models are pre-optimized for performance on AMD Instinct MI300
|
||||
series GPUs. Some instructions, commands, and available training
|
||||
configurations in this documentation might vary by model -- select one to get
|
||||
started.
|
||||
|
||||
@@ -133,13 +139,22 @@ Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
.. code-block:: shell
|
||||
{% for docker in dockers %}
|
||||
{% set jax_version = docker.components["JAX"] %}
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
.. tab-item:: JAX {{ jax_version }}
|
||||
:sync: {{ docker.pull_tag }}
|
||||
|
||||
.. _amd-maxtext-multi-node-setup-v259:
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-maxtext-multi-node-setup-v257:
|
||||
|
||||
Multi-node configuration
|
||||
------------------------
|
||||
@@ -147,7 +162,7 @@ Multi-node configuration
|
||||
See :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your
|
||||
environment for multi-node training.
|
||||
|
||||
.. _amd-maxtext-get-started-v259:
|
||||
.. _amd-maxtext-get-started-v257:
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
@@ -159,7 +174,7 @@ benchmark results:
|
||||
|
||||
.. _vllm-benchmark-mad:
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set dockers = data.dockers %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
@@ -171,9 +186,6 @@ benchmark results:
|
||||
{% if model.mad_tag and "single-node" in model.doc_options %}
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-maxtext-model-support-v259` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
@@ -202,19 +214,22 @@ benchmark results:
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
The following commands are optimized for {{ model.model }}. See
|
||||
:ref:`amd-maxtext-model-support-v259` to switch to another
|
||||
available model. Some instructions and resources might not be
|
||||
available for all models and configurations.
|
||||
|
||||
.. rubric:: Download the Docker image and required scripts
|
||||
|
||||
Run the JAX MaxText benchmark tool independently by starting the
|
||||
Docker container as shown in the following snippet.
|
||||
|
||||
.. code-block:: shell
|
||||
.. tab-set::
|
||||
{% for docker in dockers %}
|
||||
{% set jax_version = docker.components["JAX"] %}
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
.. tab-item:: JAX {{ jax_version }}
|
||||
:sync: {{ docker.pull_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
{% if model.model_repo and "single-node" in model.doc_options %}
|
||||
.. rubric:: Single node training
|
||||
@@ -235,25 +250,33 @@ benchmark results:
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
.. tab-set::
|
||||
{% for docker in dockers %}
|
||||
{% set jax_version = docker.components["JAX"] %}
|
||||
|
||||
docker run -it \
|
||||
--device=/dev/dri \
|
||||
--device=/dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
-v $HF_HOME:/hf_cache \
|
||||
-e HF_HOME=/hf_cache \
|
||||
-e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
.. tab-item:: JAX {{ jax_version }}
|
||||
:sync: {{ docker.pull_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device=/dev/dri \
|
||||
--device=/dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
-v $HF_HOME:/hf_cache \
|
||||
-e HF_HOME=/hf_cache \
|
||||
-e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
3. In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||
benchmark scripts directory at ``MAD/scripts/jax-maxtext``.
|
||||
@@ -276,27 +299,11 @@ benchmark results:
|
||||
|
||||
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }}
|
||||
|
||||
For quantized training, run the script with the appropriate option for your Instinct GPU.
|
||||
For quantized training, use the following command:
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
|
||||
For ``fp8`` quantized training on MI355X and MI350X GPUs, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q fp8
|
||||
|
||||
{% if model.model_repo not in ["Llama-3.1-70B", "Llama-3.3-70B"] %}
|
||||
.. tab-item:: MI325X and MI300X
|
||||
|
||||
For ``nanoo_fp8`` quantized training on MI300X series GPUs, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8
|
||||
{% endif %}
|
||||
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8
|
||||
|
||||
{% endif %}
|
||||
{% if model.multinode_training_script and "multi-node" in model.doc_options %}
|
||||
@@ -328,7 +335,7 @@ benchmark results:
|
||||
{% else %}
|
||||
.. rubric:: Multi-node training
|
||||
|
||||
For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v259`
|
||||
For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v257`
|
||||
with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
@@ -10,12 +10,6 @@ Training a model with Megatron-LM on ROCm
|
||||
|
||||
.. caution::
|
||||
|
||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/megatron-lm
|
||||
<https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including Megatron-LM and :doc:`torchtitan <primus-pytorch>`.
|
||||
|
||||
Primus with Megatron is designed to replace this ROCm Megatron-LM training workflow.
|
||||
To learn how to migrate workloads from Megatron-LM to Primus with Megatron,
|
||||
see :doc:`previous-versions/megatron-lm-primus-migration-guide`.
|
||||
@@ -23,25 +17,30 @@ Training a model with Megatron-LM on ROCm
|
||||
The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
|
||||
a specialized fork of the robust Megatron-LM, designed to enable efficient
|
||||
training of large-scale language models on AMD GPUs. By leveraging AMD
|
||||
Instinct™ GPUs, Megatron-LM delivers enhanced scalability, performance, and
|
||||
resource utilization for AI workloads. It is
|
||||
Instinct™ MI300X series GPUs, Megatron-LM delivers enhanced
|
||||
scalability, performance, and resource utilization for AI workloads. It is
|
||||
purpose-built to support models like Llama, DeepSeek, and Mixtral,
|
||||
enabling developers to train next-generation AI models more
|
||||
efficiently.
|
||||
|
||||
AMD provides ready-to-use Docker images for MI355X, MI350X, MI325X, and MI300X
|
||||
GPUs containing essential components, including PyTorch, ROCm libraries, and
|
||||
Megatron-LM utilities. It contains the following software components to
|
||||
accelerate training workloads:
|
||||
AMD provides ready-to-use Docker images for MI300X series GPUs containing
|
||||
essential components, including PyTorch, ROCm libraries, and Megatron-LM
|
||||
utilities. It contains the following software components to accelerate training
|
||||
workloads:
|
||||
|
||||
.. note::
|
||||
|
||||
This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
|
||||
Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
{% for docker in dockers %}
|
||||
.. tab-item:: ``{{ docker.pull_tag }}``
|
||||
:sync: {{ docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -52,8 +51,10 @@ accelerate training workloads:
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-megatron-lm-model-support:
|
||||
|
||||
Supported models
|
||||
@@ -150,24 +151,33 @@ Download the Docker image
|
||||
{% set dockers = data.dockers %}
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
{% if dockers|length > 1 %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
{% for docker in data.dockers %}
|
||||
.. tab-item:: {{ docker.doc_name }}
|
||||
:sync: {{ docker.pull_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
{% endfor %}
|
||||
{% elif dockers|length == 1 %}
|
||||
{% set docker = dockers[0] %}
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
{% endif %}
|
||||
2. Launch the Docker container.
|
||||
|
||||
{% if dockers|length > 1 %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
{% for docker in dockers %}
|
||||
.. tab-item:: {{ docker.doc_name }}
|
||||
:sync: {{ docker.pull_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
@@ -185,7 +195,28 @@ Download the Docker image
|
||||
--shm-size 128G \
|
||||
--name megatron_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
{% endfor %}
|
||||
{% elif dockers|length == 1 %}
|
||||
{% set docker = dockers[0] %}
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 128G \
|
||||
--name megatron_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
{% endif %}
|
||||
|
||||
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
|
||||
|
||||
@@ -203,8 +234,8 @@ Download the Docker image
|
||||
pip uninstall megatron-core
|
||||
pip install -e .
|
||||
|
||||
The Docker container hosts a verified commit of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__.
|
||||
The Docker container hosts
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__ at verified commit ``e8e9edc``.
|
||||
|
||||
.. _amd-megatron-lm-environment-setup:
|
||||
|
||||
@@ -541,73 +572,31 @@ Single node training
|
||||
To run training on a single node for Llama 3.1 8B FP8, navigate to the Megatron-LM folder and use the
|
||||
following command.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI350X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TEE_OUTPUT=1 \
|
||||
MBS=4 \
|
||||
BS=512 \
|
||||
TP=1 \
|
||||
TE_FP8=1 \
|
||||
SEQ_LENGTH=8192 \
|
||||
MODEL_SIZE=8 \
|
||||
TOTAL_ITERS=10 \
|
||||
GEMM_TUNING=0 \
|
||||
bash examples/llama/train_llama3.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TEE_OUTPUT=1 \
|
||||
MBS=2 \
|
||||
BS=128 \
|
||||
TP=1 \
|
||||
TE_FP8=1 \
|
||||
SEQ_LENGTH=8192 \
|
||||
MODEL_SIZE=8 \
|
||||
TOTAL_ITERS=50 \
|
||||
bash examples/llama/train_llama3.sh
|
||||
TEE_OUTPUT=1 \
|
||||
MBS=2 \
|
||||
BS=128 \
|
||||
TP=1 \
|
||||
TE_FP8=1 \
|
||||
SEQ_LENGTH=8192 \
|
||||
MODEL_SIZE=8 \
|
||||
TOTAL_ITERS=50 \
|
||||
bash examples/llama/train_llama3.sh
|
||||
|
||||
For Llama 3.1 8B BF16, use the following command:
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI350X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TEE_OUTPUT=1 \
|
||||
MBS=4 \
|
||||
BS=512 \
|
||||
TP=1 \
|
||||
TE_FP8=0 \
|
||||
SEQ_LENGTH=8192 \
|
||||
MODEL_SIZE=8 \
|
||||
TOTAL_ITERS=10 \
|
||||
GEMM_TUNING=1 \
|
||||
bash examples/llama/train_llama3.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
TEE_OUTPUT=1 \
|
||||
MBS=2 \
|
||||
BS=128 \
|
||||
TP=1 \
|
||||
TE_FP8=0 \
|
||||
SEQ_LENGTH=8192 \
|
||||
MODEL_SIZE=8 \
|
||||
TOTAL_ITERS=50 \
|
||||
bash examples/llama/train_llama3.sh
|
||||
TEE_OUTPUT=1 \
|
||||
MBS=2 \
|
||||
BS=128 \
|
||||
TP=1 \
|
||||
TE_FP8=0 \
|
||||
SEQ_LENGTH=8192 \
|
||||
MODEL_SIZE=8 \
|
||||
TOTAL_ITERS=50 \
|
||||
bash examples/llama/train_llama3.sh
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b
|
||||
|
||||
@@ -636,60 +625,29 @@ Single node training
|
||||
parallelism, MCore's distributed optimizer, gradient accumulation fusion,
|
||||
or FP16.
|
||||
|
||||
To run the training on a single node for Llama 3.1 70B FP8, use the
|
||||
following command.
|
||||
.. container:: model-doc pyt_megatron_lm_train_llama-3.1-70b-proxy
|
||||
|
||||
To run the training on a single node for Llama 3.1 70B with proxy, use the following command.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CKPT_FORMAT=torch_dist \
|
||||
TEE_OUTPUT=1 \
|
||||
RECOMPUTE=1 \
|
||||
MBS=3 \
|
||||
BS=24 \
|
||||
TP=1 \
|
||||
TE_FP8=1 \
|
||||
SEQ_LENGTH=8192 \
|
||||
MODEL_SIZE=70 \
|
||||
FSDP=1 \
|
||||
TOTAL_ITERS=10 \
|
||||
NUM_LAYERS=40 \
|
||||
bash examples/llama/train_llama3.sh
|
||||
|
||||
.. note::
|
||||
|
||||
The MI300X configuration uses a proxy model. On MI300X GPUs, use two or more nodes
|
||||
to run the full Llama 3.1 70B model with FP8 precision. MI355X and MI350X GPUs
|
||||
can support the full 70B model with FP8 precision on a single node.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI350X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CKPT_FORMAT=torch_dist \
|
||||
TEE_OUTPUT=1 \
|
||||
RECOMPUTE=1 \
|
||||
MBS=3 \
|
||||
BS=24 \
|
||||
TP=1 \
|
||||
TE_FP8=1 \
|
||||
SEQ_LENGTH=8192 \
|
||||
MODEL_SIZE=70 \
|
||||
FSDP=1 \
|
||||
TOTAL_ITERS=10 \
|
||||
bash examples/llama/train_llama3.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
FP8_WEIGHT_TRANSPOSE_CACHE=0 \
|
||||
CKPT_FORMAT=torch_dist \
|
||||
TEE_OUTPUT=1 \
|
||||
RECOMPUTE=1 \
|
||||
MBS=3 \
|
||||
BS=24 \
|
||||
TP=1 \
|
||||
TE_FP8=1 \
|
||||
SEQ_LENGTH=8192 \
|
||||
MODEL_SIZE=70 \
|
||||
FSDP=1 \
|
||||
TOTAL_ITERS=10 \
|
||||
NUM_LAYERS=40 \
|
||||
bash examples/llama/train_llama3.sh
|
||||
|
||||
.. note::
|
||||
|
||||
The MI300X configuration uses a proxy model. On MI300X GPUs, use two or more nodes
|
||||
to run the full Llama 3.1 70B model with FP8 precision. MI355X and MI350X GPUs
|
||||
can support the full 70B model with FP8 precision on a single node.
|
||||
Use two or more nodes to run the *full* Llama 70B model with FP8 precision.
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -1029,11 +987,6 @@ The benchmark tests support the following sets of variables.
|
||||
``RECOMPUTE_NUM_LAYERS``
|
||||
Number of layers used for checkpointing recompute.
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
|
||||
@@ -17,35 +17,27 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <http
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - 25.9 (latest)
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* JAX 0.6.2
|
||||
-
|
||||
* :doc:`Documentation <../jax-maxtext>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7-jax060/images/sha256-7352212ae033a76dca2b9dceffc23c1b5f1a61a7a560082cf747a9bf1acfc9ce>`__
|
||||
|
||||
* - 25.7
|
||||
-
|
||||
* - 25.7 (latest)
|
||||
-
|
||||
* ROCm 6.4.1
|
||||
* JAX 0.6.0, 0.5.0
|
||||
-
|
||||
* :doc:`Documentation <jax-maxtext-v25.7>`
|
||||
-
|
||||
* :doc:`Documentation <../jax-maxtext>`
|
||||
* `Docker Hub (JAX 0.6.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7-jax060/images/sha256-7352212ae033a76dca2b9dceffc23c1b5f1a61a7a560082cf747a9bf1acfc9ce>`__
|
||||
* `Docker Hub (JAX 0.5.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025>`__
|
||||
|
||||
* - 25.5
|
||||
-
|
||||
-
|
||||
* ROCm 6.3.4
|
||||
* JAX 0.4.35
|
||||
-
|
||||
-
|
||||
* :doc:`Documentation <jax-maxtext-v25.5>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`__
|
||||
|
||||
* - 25.4
|
||||
-
|
||||
-
|
||||
* ROCm 6.3.0
|
||||
* JAX 0.4.31
|
||||
-
|
||||
-
|
||||
* :doc:`Documentation <jax-maxtext-v25.4>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.4/images/sha256-fb3eb71cd74298a7b3044b7130cf84113f14d518ff05a2cd625c11ea5f6a7b01>`__
|
||||
|
||||
@@ -1,366 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using JAX MaxText for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
|
||||
|
||||
******************************************
|
||||
Training a model with JAX MaxText on ROCm
|
||||
******************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm JAX MaxText
|
||||
training performance documentation. See :doc:`../jax-maxtext` for the latest version.
|
||||
|
||||
MaxText is a high-performance, open-source framework built on the Google JAX
|
||||
machine learning library to train LLMs at scale. The MaxText framework for
|
||||
ROCm is an optimized fork of the upstream
|
||||
`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
|
||||
on AMD MI300X series GPUs.
|
||||
|
||||
The MaxText for ROCm training Docker image
|
||||
provides a prebuilt environment for training on AMD Instinct MI300X and MI325X GPUs,
|
||||
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
|
||||
It includes the following software components:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for docker in dockers %}
|
||||
{% set jax_version = docker.components["JAX"] %}
|
||||
|
||||
.. tab-item:: ``{{ docker.pull_tag }}``
|
||||
:sync: {{ docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
|
||||
{% endfor %}
|
||||
{% if jax_version == "0.6.0" %}
|
||||
.. note::
|
||||
|
||||
Shardy is a new config in JAX 0.6.0. You might get related errors if it's
|
||||
not configured correctly. For now you can turn it off by setting
|
||||
``shardy=False`` during the training run. You can also follow the `migration
|
||||
guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
|
||||
it.
|
||||
{% endif %}
|
||||
|
||||
{% endfor %}
|
||||
|
||||
MaxText on ROCm provides the following key features to train large language models efficiently:
|
||||
|
||||
- Transformer Engine (TE)
|
||||
|
||||
- Flash Attention (FA) 3 -- with or without sequence input packing
|
||||
|
||||
- GEMM tuning
|
||||
|
||||
- Multi-node support
|
||||
|
||||
- NANOO FP8 quantization support
|
||||
|
||||
.. _amd-maxtext-model-support-v257:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on AMD Instinct MI300
|
||||
series GPUs. Some instructions, commands, and available training
|
||||
configurations in this documentation might vary by model -- select one to get
|
||||
started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. note::
|
||||
|
||||
Some models, such as Llama 3, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
as follows. Performance can vary for other training workloads, as AMD
|
||||
doesn’t validate configurations and run conditions outside those described.
|
||||
|
||||
Pull the Docker image
|
||||
---------------------
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for docker in dockers %}
|
||||
{% set jax_version = docker.components["JAX"] %}
|
||||
|
||||
.. tab-item:: JAX {{ jax_version }}
|
||||
:sync: {{ docker.pull_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-maxtext-multi-node-setup-v257:
|
||||
|
||||
Multi-node configuration
|
||||
------------------------
|
||||
|
||||
See :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your
|
||||
environment for multi-node training.
|
||||
|
||||
.. _amd-maxtext-get-started-v257:
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||
|
||||
.. _vllm-benchmark-mad:
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% if model.mad_tag and "single-node" in model.doc_options %}
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. Use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
||||
model are collected in the following path: ``~/MAD/perf.csv/``.
|
||||
{% endif %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
.. rubric:: Download the Docker image and required scripts
|
||||
|
||||
Run the JAX MaxText benchmark tool independently by starting the
|
||||
Docker container as shown in the following snippet.
|
||||
|
||||
.. tab-set::
|
||||
{% for docker in dockers %}
|
||||
{% set jax_version = docker.components["JAX"] %}
|
||||
|
||||
.. tab-item:: JAX {{ jax_version }}
|
||||
:sync: {{ docker.pull_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
{% if model.model_repo and "single-node" in model.doc_options %}
|
||||
.. rubric:: Single node training
|
||||
|
||||
1. Set up environment variables.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN=<Your Hugging Face token>
|
||||
export HF_HOME=<Location of saved/cached Hugging Face models>
|
||||
|
||||
``MAD_SECRETS_HFTOKEN`` is your Hugging Face access token to access models, tokenizers, and data.
|
||||
See `User access tokens <https://huggingface.co/docs/hub/en/security-tokens>`__.
|
||||
|
||||
``HF_HOME`` is where ``huggingface_hub`` will store local data. See `huggingface_hub CLI <https://huggingface.co/docs/huggingface_hub/main/en/guides/cli#huggingface-cli-download>`__.
|
||||
If you already have downloaded or cached Hugging Face artifacts, set this variable to that path.
|
||||
Downloaded files typically get cached to ``~/.cache/huggingface``.
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
{% for docker in dockers %}
|
||||
{% set jax_version = docker.components["JAX"] %}
|
||||
|
||||
.. tab-item:: JAX {{ jax_version }}
|
||||
:sync: {{ docker.pull_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device=/dev/dri \
|
||||
--device=/dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
-v $HF_HOME:/hf_cache \
|
||||
-e HF_HOME=/hf_cache \
|
||||
-e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
3. In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||
benchmark scripts directory at ``MAD/scripts/jax-maxtext``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/jax-maxtext
|
||||
|
||||
4. Run the setup scripts to install libraries and datasets needed
|
||||
for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./jax-maxtext_benchmark_setup.sh -m {{ model.model_repo }}
|
||||
|
||||
5. To run the training benchmark without quantization, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }}
|
||||
|
||||
For quantized training, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8
|
||||
|
||||
{% endif %}
|
||||
{% if model.multinode_training_script and "multi-node" in model.doc_options %}
|
||||
.. rubric:: Multi-node training
|
||||
|
||||
The following examples use SLURM to run on multiple nodes.
|
||||
|
||||
.. note::
|
||||
|
||||
The following scripts will launch the Docker container and run the
|
||||
benchmark. Run them outside of any Docker container.
|
||||
|
||||
1. Make sure ``$HF_HOME`` is set before running the test. See
|
||||
`ROCm benchmarking <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/readme.md>`__
|
||||
for more details on downloading the Llama models before running the
|
||||
benchmark.
|
||||
|
||||
2. To run multi-node training for {{ model.model }},
|
||||
use the
|
||||
`multi-node training script <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/{{ model.multinode_training_script }}>`__
|
||||
under the ``scripts/jax-maxtext/gpu-rocm/`` directory.
|
||||
|
||||
3. Run the multi-node training benchmark script.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sbatch -N <num_nodes> {{ model.multinode_training_script }}
|
||||
|
||||
{% else %}
|
||||
.. rubric:: Multi-node training
|
||||
|
||||
For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v257`
|
||||
with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`jax-maxtext-history` to find documentation for previous releases
|
||||
of the ``ROCm/jax-training`` Docker image.
|
||||
@@ -16,73 +16,62 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - v25.9 (latest)
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* Primus 0.3.0
|
||||
* PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
-
|
||||
* :doc:`Primus Megatron documentation <../primus-megatron>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
|
||||
* `Docker Hub (gfx950) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
|
||||
* `Docker Hub (gfx942) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357>`__
|
||||
|
||||
* - v25.8
|
||||
-
|
||||
* - v25.8 (latest)
|
||||
-
|
||||
* ROCm 6.4.3
|
||||
* PyTorch 2.8.0a0+gitd06a406
|
||||
-
|
||||
* :doc:`Primus Megatron documentation <primus-megatron-v25.8>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <megatron-lm-v25.8>`
|
||||
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-0030c4a3dcb233c66dd5f61135821f9f5c4e321cbe0a2cdc74f110752f28c869>`__
|
||||
-
|
||||
* :doc:`Primus Megatron documentation <../primus-megatron>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
|
||||
* `Docker Hub (py310) <https://hub.docker.com/r/rocm/megatron-lm/tags>`__
|
||||
|
||||
* - v25.7
|
||||
-
|
||||
-
|
||||
* ROCm 6.4.2
|
||||
* PyTorch 2.8.0a0+gitd06a406
|
||||
-
|
||||
-
|
||||
* :doc:`Primus Megatron documentation <primus-megatron-v25.7>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <megatron-lm-v25.7>`
|
||||
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a>`__
|
||||
|
||||
* - v25.6
|
||||
-
|
||||
-
|
||||
* ROCm 6.4.1
|
||||
* PyTorch 2.8.0a0+git7d205b2
|
||||
-
|
||||
-
|
||||
* :doc:`Documentation <megatron-lm-v25.6>`
|
||||
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
|
||||
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__
|
||||
|
||||
* - v25.5
|
||||
-
|
||||
-
|
||||
* ROCm 6.3.4
|
||||
* PyTorch 2.8.0a0+gite2f9759
|
||||
-
|
||||
-
|
||||
* :doc:`Documentation <megatron-lm-v25.5>`
|
||||
* `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.5_py312/images/sha256-4506f18ba188d24189c6b1f95130b425f52c528a543bb3f420351824edceadc2>`__
|
||||
* `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.5_py310/images/sha256-743fbf1ceff7a44c4452f938d783a7abf143737d1c15b2b95f6f8a62e0fd048b>`__
|
||||
|
||||
* - v25.4
|
||||
-
|
||||
-
|
||||
* ROCm 6.3.0
|
||||
* PyTorch 2.7.0a0+git637433
|
||||
-
|
||||
* PyTorch 2.7.0a0+git637433
|
||||
-
|
||||
* :doc:`Documentation <megatron-lm-v25.4>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.4/images/sha256-941aa5387918ea91c376c13083aa1e6c9cab40bb1875abbbb73bbb65d8736b3f>`__
|
||||
|
||||
* - v25.3
|
||||
-
|
||||
-
|
||||
* ROCm 6.3.0
|
||||
* PyTorch 2.7.0a0+git637433
|
||||
-
|
||||
* PyTorch 2.7.0a0+git637433
|
||||
-
|
||||
* :doc:`Documentation <megatron-lm-v25.3>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.3/images/sha256-1e6ed9bdc3f4ca397300d5a9907e084ab5e8ad1519815ee1f868faf2af1e04e2>`__
|
||||
|
||||
* - v24.12-dev
|
||||
-
|
||||
-
|
||||
* ROCm 6.1.0
|
||||
* PyTorch 2.4.0
|
||||
-
|
||||
-
|
||||
* :doc:`Documentation <megatron-lm-v24.12-dev>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/24.12-dev/images/sha256-5818c50334ce3d69deeeb8f589d83ec29003817da34158ebc9e2d112b929bf2e>`__
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,667 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using Megatron-LM for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
|
||||
|
||||
********************************************
|
||||
Training a model with Primus and Megatron-LM
|
||||
********************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm Megatron-LM
|
||||
training performance documentation. See :doc:`../primus-megatron` for the latest version.
|
||||
|
||||
`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
|
||||
LLM training framework designed to streamline training. It simplifies LLM
|
||||
training on AMD Instinct GPUs using a modular, reproducible configuration paradigm.
|
||||
Primus is backend-agnostic and supports multiple training engines -- including Megatron.
|
||||
|
||||
.. note::
|
||||
|
||||
Primus with Megatron is designed to replace the :doc:`ROCm Megatron-LM training <../megatron-lm>` workflow.
|
||||
To learn how to migrate workloads from Megatron-LM to Primus with Megatron,
|
||||
see :doc:`megatron-lm-primus-migration-guide`.
|
||||
|
||||
For ease of use, AMD provides a ready-to-use Docker image for MI300 series GPUs
|
||||
containing essential components for Primus and Megatron-LM. This Docker is powered by Primus
|
||||
Turbo optimizations for performance; this release adds support for Primus Turbo
|
||||
with optimized attention and grouped GEMM kernels.
|
||||
|
||||
.. note::
|
||||
|
||||
This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
|
||||
Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <megatron-lm-v25.6>`.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.8-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set docker = dockers[0] %}
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-primus-megatron-lm-model-support:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on AMD Instinct MI300X series GPUs.
|
||||
Some instructions, commands, and training examples in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.8-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. note::
|
||||
|
||||
Some models, such as Llama, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
.. _mi300x-amd-primus-megatron-lm-training:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.8-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set docker = dockers[0] %}
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
Use the following instructions to set up the environment, configure the script to train models, and
|
||||
reproduce the benchmark results on MI300X series GPUs with the ``{{ docker.pull_tag }}`` image.
|
||||
|
||||
.. _amd-primus-megatron-lm-requirements:
|
||||
|
||||
Download the Docker image
|
||||
-------------------------
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
--shm-size 128G \
|
||||
--name primus_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start primus_training_env
|
||||
docker exec -it primus_training_env bash
|
||||
|
||||
The Docker container hosts verified commit ``927a717`` of the `Primus
|
||||
<https://github.com/AMD-AGI/Primus/tree/927a71702784347a311ca48fd45f0f308c6ef6dd>`__ repository.
|
||||
|
||||
.. _amd-primus-megatron-lm-environment-setup:
|
||||
|
||||
Configuration
|
||||
=============
|
||||
|
||||
Primus defines a training configuration in YAML for each model in
|
||||
`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/927a71702784347a311ca48fd45f0f308c6ef6dd/examples/megatron/configs>`__.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.8-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
To update training parameters for {{ model.model }}, you can update ``examples/megatron/configs/{{ model.config_name }}``.
|
||||
Note that training configuration YAML files for other models follow this naming convention.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. note::
|
||||
|
||||
See :ref:`Key options <amd-primus-megatron-lm-benchmark-test-vars>` for more information on configuration options.
|
||||
|
||||
Dataset options
|
||||
---------------
|
||||
|
||||
You can use either mock data or real data for training.
|
||||
|
||||
* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default
|
||||
value is ``true`` for enabled.
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
mock_data: true
|
||||
|
||||
* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
mock_data: false
|
||||
train_data_path: /path/to/your/dataset
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
.. _amd-primus-megatron-lm-tokenizer:
|
||||
|
||||
Tokenizer
|
||||
---------
|
||||
|
||||
Set the ``HF_TOKEN`` environment variable with
|
||||
the right permissions to access the tokenizer for each model.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Export your HF_TOKEN in the workspace
|
||||
export HF_TOKEN=<your_hftoken>
|
||||
|
||||
.. note::
|
||||
|
||||
In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama
|
||||
3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
|
||||
``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model
|
||||
<https://github.com/AMD-AGI/Primus/blob/927a71702784347a311ca48fd45f0f308c6ef6dd/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
|
||||
definition.
|
||||
|
||||
.. _amd-primus-megatron-lm-run-training:
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
Use the following example commands to set up the environment, configure
|
||||
:ref:`key options <amd-primus-megatron-lm-benchmark-test-vars>`, and run training on
|
||||
MI300X series GPUs with the AMD Megatron-LM environment.
|
||||
|
||||
Single node training
|
||||
--------------------
|
||||
|
||||
To run training on a single node, navigate to ``/workspace/Primus`` and use the following setup command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
pip install -r requirements.txt
|
||||
export HSA_NO_SCRATCH_RECLAIM=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.3 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 3.3 70B BF16, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--micro_batch_size 2 \
|
||||
--global_batch_size 16 \
|
||||
--train_iters 50
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.1 8B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 3.1 8B FP8, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
|
||||
For Llama 3.1 8B BF16, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh --train_iters 50
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.1 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 3.1 70B BF16, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
|
||||
To run the training on a single node for Llama 3.1 70B FP8 with proxy, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--num_layers 40 \
|
||||
--fp8 hybrid
|
||||
|
||||
.. note::
|
||||
|
||||
Use two or more nodes to run the *full* Llama 70B model with FP8 precision.
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 2 7B FP8, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
|
||||
To run pre-training for Llama 2 7B BF16, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh --train_iters 50
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 2 70B BF16, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh --train_iters 50
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to DeepSeek-V3.
|
||||
See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.
|
||||
|
||||
To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy,
|
||||
use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--num_layers 3 \
|
||||
--moe_layer_freq 1 \
|
||||
--train_iters 50
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to DeepSeek-V2-Lite.
|
||||
See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.
|
||||
|
||||
To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
|
||||
use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--global_batch_size 256 \
|
||||
--train_iters 50
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Mixtral 8x7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.
|
||||
|
||||
To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
|
||||
use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --train_iters 50
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Mixtral 8x22B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.
|
||||
|
||||
To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with 4-layer proxy,
|
||||
use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--num_layers 4 \
|
||||
--pipeline_model_parallel_size 1 \
|
||||
--micro_batch_size 1 \
|
||||
--global_batch_size 16 \
|
||||
--train_iters 50
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Qwen 2.5 7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.
|
||||
|
||||
To run training on a single node for Qwen 2.5 7B BF16, use the following
|
||||
command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --train_iters 50
|
||||
|
||||
For FP8, use the following command.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Qwen 2.5 72B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support` to switch to another available model.
|
||||
|
||||
To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --train_iters 50
|
||||
|
||||
.. _amd-primus-megatron-multi-node-examples:
|
||||
|
||||
Multi-node training examples
|
||||
----------------------------
|
||||
|
||||
Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
|
||||
training.
|
||||
|
||||
To run training on multiple nodes, you can use the
|
||||
`run_slurm_pretrain.sh <https://github.com/AMD-AGI/Primus/blob/927a71702784347a311ca48fd45f0f308c6ef6dd/examples/run_slurm_pretrain.sh>`__
|
||||
to launch the multi-node workload. Use the following steps to set up your environment:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.8-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set docker = dockers[0] %}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/Primus/
|
||||
export DOCKER_IMAGE={{ docker.pull_tag }}
|
||||
export HF_TOKEN=<your_HF_token>
|
||||
export HSA_NO_SCRATCH_RECLAIM=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
|
||||
export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
|
||||
export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
|
||||
export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
|
||||
|
||||
.. note::
|
||||
|
||||
* Make sure correct network drivers are installed on the nodes. If inside a Docker, either install the drivers inside the Docker container or pass the network drivers from the host while creating Docker container.
|
||||
* If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect. However, since NICs can vary across different clusters, it is encouraged to explicitly export your NCCL parameters for the cluster.
|
||||
* To find your network interface, you can use ``ip a``.
|
||||
* To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices.
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||
|
||||
To train Llama 3.3 70B FP8 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 1 \
|
||||
--global_batch_size 256 \
|
||||
--recompute_num_layers 80 \
|
||||
--fp8 hybrid
|
||||
|
||||
To train Llama 3.3 70B BF16 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 1 \
|
||||
--global_batch_size 256 \
|
||||
--recompute_num_layers 12
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||
|
||||
To train Llama 3.1 8B FP8 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Adjust the training parameters. For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
|
||||
NNODES=8 EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_slurm_pretrain.sh \
|
||||
--global_batch_size 1024 \
|
||||
--fp8 hybrid
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||
|
||||
To train Llama 3.1 70B FP8 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 1 \
|
||||
--global_batch_size 256 \
|
||||
--recompute_num_layers 80 \
|
||||
--fp8 hybrid
|
||||
|
||||
To train Llama 3.1 70B BF16 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 1 \
|
||||
--global_batch_size 256 \
|
||||
--recompute_num_layers 12
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
|
||||
|
||||
To train Llama 2 7B FP8 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Adjust the training parameters. For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
|
||||
NNODES=8 EXP=examples/megatron/configs/llama2_7B-pretrain.yaml bash ./examples/run_slurm_pretrain.sh --global_batch_size 2048 --fp8 hybrid
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
|
||||
|
||||
To train Llama 2 70B FP8 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 2 \
|
||||
--global_batch_size 256 \
|
||||
--recompute_num_layers 80 \
|
||||
--fp8 hybrid
|
||||
|
||||
To train Llama 2 70B BF16 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||
bash ./examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 2 \
|
||||
--global_batch_size 1536 \
|
||||
--recompute_num_layers 12
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||
|
||||
To train Mixtral 8x7B BF16 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES=8 EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 2 \
|
||||
--global_batch_size 256
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||
|
||||
To train Qwen2.5 72B FP8 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES=8 EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 4 \
|
||||
--global_batch_size 256 \
|
||||
--recompute_num_layers 80 \
|
||||
--fp8 hybrid
|
||||
|
||||
.. _amd-primus-megatron-lm-benchmark-test-vars:
|
||||
|
||||
Key options
|
||||
-----------
|
||||
|
||||
The following are key options to take note of:
|
||||
|
||||
fp8
|
||||
``hybrid`` enables FP8 GEMMs.
|
||||
|
||||
use_torch_fsdp2
|
||||
``use_torch_fsdp2: 1`` enables torch fsdp-v2. If FSDP is enabled,
|
||||
set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``.
|
||||
|
||||
profile
|
||||
To enable PyTorch profiling, set these parameters:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
profile: true
|
||||
use_pytorch_profiler: true
|
||||
profile_step_end: 7
|
||||
profile_step_start: 6
|
||||
|
||||
train_iters
|
||||
The total number of iterations (default: 50).
|
||||
|
||||
mock_data
|
||||
True by default.
|
||||
|
||||
micro_batch_size
|
||||
Micro batch size.
|
||||
|
||||
global_batch_size
|
||||
Global batch size.
|
||||
|
||||
recompute_granularity
|
||||
For activation checkpointing.
|
||||
|
||||
num_layers
|
||||
For using a reduced number of layers as with proxy models.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
|
||||
Framework for Large Models on AMD GPUs <https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`megatron-lm-history` to find documentation for previous releases
|
||||
of the ``ROCm/megatron-lm`` Docker image.
|
||||
|
||||
This training environment now uses Primus with Megatron as the primary
|
||||
configuration. Limited support for the legacy ROCm Megatron-LM is still
|
||||
available; see the :doc:`../megatron-lm` documentation.
|
||||
@@ -1,312 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
****************************************
|
||||
Training a model with Primus and PyTorch
|
||||
****************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm Primus PyTorch training
|
||||
performance benchmark documentation. See :doc:`../primus-pytorch` for the latest version.
|
||||
|
||||
`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
|
||||
LLM training framework designed to streamline training. It streamlines LLM
|
||||
training on AMD Instinct GPUs using a modular, reproducible configuration paradigm.
|
||||
Primus now supports the PyTorch torchtitan backend.
|
||||
|
||||
.. note::
|
||||
|
||||
Primus with the PyTorch torchtitan backend is designed to replace the :doc:`ROCm PyTorch training <../pytorch-training>` workflow.
|
||||
See :doc:`../pytorch-training` for steps to run workloads without Primus.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.8-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set docker = dockers[0] %}
|
||||
For ease of use, AMD provides a ready-to-use Docker image -- ``{{
|
||||
docker.pull_tag }}`` -- for MI300X series GPUs containing essential
|
||||
components for Primus and PyTorch training with
|
||||
Primus Turbo optimizations.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-primus-pytorch-model-support-v258:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs.
|
||||
Some instructions, commands, and training recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.8-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0" style="display: none;">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. seealso::
|
||||
|
||||
For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
|
||||
see the documentation :doc:`../pytorch-training` (without Primus)
|
||||
|
||||
.. _amd-primus-pytorch-performance-measurements-v258:
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t test configurations and run conditions outside those described.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.8-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.dockers[0] %}
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
Use the following command to pull the `Docker image <{{ unified_docker.docker_hub_url }}>`_ from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between the following two workflows to start benchmarking training.
|
||||
For fine-tuning workloads and multi-node training examples, see :doc:`../pytorch-training` (without Primus).
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v258` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one node with the {{ model.precision }} data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{ model.mad_tag }} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v258` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ unified_docker.pull_tag }}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
|
||||
repository and navigate to the benchmark scripts directory
|
||||
``/workspace/MAD/scripts/pytorch_train``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/pytorch_train
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
1. The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
2. Run the setup script to install libraries and datasets needed for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_setup.sh
|
||||
|
||||
.. rubric:: Pretraining
|
||||
|
||||
To start the pretraining benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t pretrain \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if model.mad_tag == "primus_pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||
- Currently, only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 8192. 8192 by default.
|
||||
- Sequence length for the language model.
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
Use the following command to run train {{ model.model }} with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -m {{ model.model_repo }}
|
||||
|
||||
To train {{ model.model }} with FP8 precision, use the following command.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -m {{ model.model_repo }} -p FP8
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
|
||||
Framework for Large Models on AMD GPUs <https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html>`__.
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||
of the ``ROCm/pytorch-training`` Docker image.
|
||||
@@ -16,62 +16,51 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - v25.9 (latest)
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* Primus 0.3.0
|
||||
* PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
-
|
||||
* :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
|
||||
* :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
|
||||
* `Docker Hub (gfx950) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
|
||||
* `Docker Hub (gfx942) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357>`__
|
||||
|
||||
* - v25.8
|
||||
-
|
||||
* - v25.8 (latest)
|
||||
-
|
||||
* ROCm 6.4.3
|
||||
* PyTorch 2.8.0a0+gitd06a406
|
||||
-
|
||||
* :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.8>`
|
||||
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.8>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5>`__
|
||||
-
|
||||
* :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
|
||||
* :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
|
||||
* `Docker Hub <https://hub.docker.com/r/rocm/pytorch-training/tags>`__
|
||||
|
||||
* - v25.7
|
||||
-
|
||||
-
|
||||
* ROCm 6.4.2
|
||||
* PyTorch 2.8.0a0+gitd06a406
|
||||
-
|
||||
-
|
||||
* :doc:`Documentation <pytorch-training-v25.7>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712>`__
|
||||
|
||||
* - v25.6
|
||||
-
|
||||
-
|
||||
* ROCm 6.3.4
|
||||
* PyTorch 2.8.0a0+git7d205b2
|
||||
-
|
||||
-
|
||||
* :doc:`Documentation <pytorch-training-v25.6>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__
|
||||
|
||||
* - v25.5
|
||||
-
|
||||
-
|
||||
* ROCm 6.3.4
|
||||
* PyTorch 2.7.0a0+git637433
|
||||
-
|
||||
-
|
||||
* :doc:`Documentation <pytorch-training-v25.5>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`__
|
||||
|
||||
* - v25.4
|
||||
-
|
||||
-
|
||||
* ROCm 6.3.0
|
||||
* PyTorch 2.7.0a0+git637433
|
||||
-
|
||||
-
|
||||
* :doc:`Documentation <pytorch-training-v25.4>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.4/images/sha256-fa98a9aa69968e654466c06f05aaa12730db79b48b113c1ab4f7a5fe6920a20b>`__
|
||||
|
||||
* - v25.3
|
||||
-
|
||||
-
|
||||
* ROCm 6.3.0
|
||||
* PyTorch 2.7.0a0+git637433
|
||||
-
|
||||
-
|
||||
* :doc:`Documentation <pytorch-training-v25.3>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.3/images/sha256-0ffdde1b590fd2787b1c7adf5686875b100980b0f314090901387c44253e709b>`__
|
||||
|
||||
@@ -10,7 +10,7 @@ Training a model with PyTorch for ROCm
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm PyTorch training
|
||||
This documentation does not reflect the latest version of ROCm vLLM
|
||||
performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
|
||||
|
||||
PyTorch is an open-source machine learning framework that is widely used for
|
||||
|
||||
@@ -1,588 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
**************************************
|
||||
Training a model with PyTorch on ROCm
|
||||
**************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm PyTorch training
|
||||
performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
|
||||
|
||||
PyTorch is an open-source machine learning framework that is widely used for
|
||||
model training with GPU-optimized components for transformer-based models.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.8-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set docker = dockers[0] %}
|
||||
The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__
|
||||
(``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
|
||||
model on AMD Instinct MI325X and MI300X GPUs. It includes the following software components to accelerate
|
||||
training workloads:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-pytorch-training-model-support:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs.
|
||||
Some instructions, commands, and training recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.8-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
.. _amd-pytorch-training-supported-training-modes:
|
||||
|
||||
The following table lists supported training modes per model.
|
||||
|
||||
.. dropdown:: Supported training modes
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Model
|
||||
- Supported training modes
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if model.training_modes %}
|
||||
* - {{ model.model }}
|
||||
- ``{{ model.training_modes | join('``, ``') }}``
|
||||
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. note::
|
||||
|
||||
Some model and fine-tuning combinations are not listed. This is
|
||||
because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
|
||||
doesn't provide default YAML configurations for them.
|
||||
For advanced usage, you can create a custom configuration to enable
|
||||
unlisted fine-tuning methods by using an existing file in the
|
||||
``/workspace/torchtune/recipes/configs`` directory as a template.
|
||||
|
||||
.. _amd-pytorch-training-performance-measurements:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
page provides reference throughput and latency measurements for training
|
||||
popular AI models.
|
||||
|
||||
.. note::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
should not be interpreted as the peak performance achievable by AMD
|
||||
Instinct MI325X and MI300X GPUs or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t test configurations and run conditions outside those described.
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.8-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between two options to start benchmarking training:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one node with the {{ model.precision }} data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{ model.mad_tag }} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ unified_docker.pull_tag }}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
|
||||
repository and navigate to the benchmark scripts directory
|
||||
``/workspace/MAD/scripts/pytorch_train``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/pytorch_train
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
1. The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
2. Run the setup script to install libraries and datasets needed for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_setup.sh
|
||||
|
||||
.. container:: model-doc pyt_train_llama-3.1-8b
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
.. container:: model-doc pyt_train_llama-3.1-70b
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
* - ``torchdata``
|
||||
- `TorchData <https://meta-pytorch.org/data/beta/index.html#torchdata>`__
|
||||
|
||||
* - ``tomli``
|
||||
- `Tomli <https://pypi.org/project/tomli/>`__
|
||||
|
||||
* - ``tiktoken``
|
||||
- `tiktoken <https://github.com/openai/tiktoken>`__
|
||||
|
||||
* - ``blobfile``
|
||||
- `blobfile <https://pypi.org/project/blobfile/>`__
|
||||
|
||||
* - ``tabulate``
|
||||
- `tabulate <https://pypi.org/project/tabulate/>`__
|
||||
|
||||
* - ``wandb``
|
||||
- `Weights & Biases <https://github.com/wandb/wandb>`__
|
||||
|
||||
* - ``sentencepiece``
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
|
||||
|
||||
.. container:: model-doc pyt_train_flux
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`__ 3.2.0
|
||||
|
||||
* - ``sentencepiece``
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
|
||||
|
||||
* - ``csvkit``
|
||||
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`__ 2.0.1
|
||||
|
||||
* - ``deepspeed``
|
||||
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`__ 0.16.2
|
||||
|
||||
* - ``diffusers``
|
||||
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`__ 0.31.0
|
||||
|
||||
* - ``GitPython``
|
||||
- `GitPython <https://github.com/gitpython-developers/GitPython>`__ 3.1.44
|
||||
|
||||
* - ``opencv-python-headless``
|
||||
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`__ 4.10.0.84
|
||||
|
||||
* - ``peft``
|
||||
- `PEFT <https://huggingface.co/docs/peft/en/index>`__ 0.14.0
|
||||
|
||||
* - ``protobuf``
|
||||
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`__ 5.29.2
|
||||
|
||||
* - ``pytest``
|
||||
- `PyTest <https://docs.pytest.org/en/stable/>`__ 8.3.4
|
||||
|
||||
* - ``python-dotenv``
|
||||
- `python-dotenv <https://pypi.org/project/python-dotenv/>`__ 1.0.1
|
||||
|
||||
* - ``seaborn``
|
||||
- `Seaborn <https://seaborn.pydata.org/>`__ 0.13.2
|
||||
|
||||
* - ``transformers``
|
||||
- `Transformers <https://huggingface.co/docs/transformers/en/index>`__ 4.47.0
|
||||
|
||||
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
|
||||
|
||||
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`__
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
{% set training_modes = model.training_modes %}
|
||||
{% set training_mode_descs = {
|
||||
"pretrain": "Benchmark pre-training.",
|
||||
"HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
|
||||
{% if available_modes %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Pre-training
|
||||
|
||||
To start the pre-training benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
{% if model.mad_tag == "pyt_train_flux" %}
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
Currently, FLUX models are not supported out-of-the-box on {{ unified_docker.pull_tag }}.
|
||||
To use FLUX, refer to ``rocm/pytorch-training`` Docker: :doc:`pytorch-training-v25.6`
|
||||
|
||||
Occasionally, downloading the Flux dataset might fail. In the event of this
|
||||
error, manually download it from Hugging Face at
|
||||
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||
and save it to `/workspace/FluxBenchmark`. This ensures that the test script can access
|
||||
the required dataset.
|
||||
{% endif %}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Sequence length for the language model.
|
||||
- Between 2048 and 8192. 8192 by default.
|
||||
{% endif %}
|
||||
|
||||
{% set training_mode_descs = {
|
||||
"finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
|
||||
"finetune_lora": "LoRA fine-tuning (BF16 supported).",
|
||||
"finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
|
||||
"HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
|
||||
{% if available_modes %}
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Fine-tuning
|
||||
|
||||
To start the fine-tuning benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes>`.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t $training_mode \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
|
||||
- All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 16384.
|
||||
- Sequence length for the language model.
|
||||
|
||||
{% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
|
||||
.. note::
|
||||
|
||||
For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
|
||||
use the following torchtune commit for compatibility:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
|
||||
|
||||
{% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
|
||||
.. note::
|
||||
|
||||
You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
|
||||
input tensor should be smaller than max_seq_len (4096)``.
|
||||
This error indicates that an input sequence is longer than the model's maximum context window.
|
||||
|
||||
Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
|
||||
tokens in this case). You can resolve this by truncating the input or splitting
|
||||
it into smaller chunks before passing it to the model.
|
||||
|
||||
Note on reproducibility: The results in this guide are based on
|
||||
commit ``b4c98ac`` from the upstream
|
||||
`<https://github.com/pytorch/torchtune>`__ repository. For the
|
||||
latest updates, you can use the main branch.
|
||||
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
|
||||
|
||||
.. _amd-pytorch-training-multinode-examples:
|
||||
|
||||
Multi-node training
|
||||
-------------------
|
||||
|
||||
Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
|
||||
training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands.
|
||||
|
||||
Pre-training
|
||||
~~~~~~~~~~~~
|
||||
|
||||
Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
|
||||
|
||||
To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# In the MAD repository
|
||||
cd scripts/pytorch_train
|
||||
sbatch run_slurm_train.sh
|
||||
|
||||
Fine-tuning
|
||||
~~~~~~~~~~~
|
||||
|
||||
Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
|
||||
|
||||
To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli login # Get access to HF Llama model space
|
||||
huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
|
||||
# In the MAD repository
|
||||
cd scripts/pytorch_train
|
||||
sbatch Torchtune_Multinode.sh
|
||||
|
||||
.. note::
|
||||
|
||||
Information regarding benchmark setup:
|
||||
|
||||
* By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
|
||||
* You can adjust the torchtune `YAML configuration file
|
||||
<https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
|
||||
if you're using a different model.
|
||||
* The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
|
||||
* Set the ``mounting_paths`` inside the SLURM script.
|
||||
|
||||
Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||
of the ``ROCm/pytorch-training`` Docker image.
|
||||
File diff suppressed because it is too large
Load Diff
@@ -13,42 +13,30 @@ Primus now supports the PyTorch torchtitan backend.
|
||||
|
||||
.. note::
|
||||
|
||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including torchtitan and :doc:`Megatron-LM <primus-megatron>`.
|
||||
|
||||
Primus with the PyTorch torchtitan backend is designed to replace the
|
||||
:doc:`ROCm PyTorch training <pytorch-training>` workflow. See
|
||||
:doc:`pytorch-training` to see steps to run workloads without Primus.
|
||||
|
||||
AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and
|
||||
MI300X GPUs containing essential components for Primus and PyTorch training
|
||||
with Primus Turbo optimizations.
|
||||
Primus with the PyTorch torchtitan backend is designed to replace the :doc:`ROCm PyTorch training <pytorch-training>` workflow.
|
||||
See :doc:`pytorch-training` to see steps to run workloads without Primus.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
{% set docker = dockers[0] %}
|
||||
For ease of use, AMD provides a ready-to-use Docker image -- ``{{
|
||||
docker.pull_tag }}`` -- for MI300X series GPUs containing essential
|
||||
components for Primus and PyTorch training with
|
||||
Primus Turbo optimizations.
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-primus-pytorch-model-support-v259:
|
||||
.. _amd-primus-pytorch-model-support-v258:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -59,21 +47,22 @@ vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="row gx-0" style="display: none;">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-12 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
<div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
@@ -94,7 +83,7 @@ vary by model -- select one to get started.
|
||||
For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
|
||||
see the documentation :doc:`pytorch-training` (without Primus)
|
||||
|
||||
.. _amd-primus-pytorch-performance-measurements-v259:
|
||||
.. _amd-primus-pytorch-performance-measurements-v258:
|
||||
|
||||
System validation
|
||||
=================
|
||||
@@ -120,34 +109,25 @@ Pull the Docker image
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set unified_docker = data.dockers[0] %}
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
Use the following command to pull the `Docker image <{{ unified_docker.docker_hub_url }}>`_ from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
Once the setup is complete, choose between the following two workflows to start benchmarking training.
|
||||
For fine-tuning workloads and multi-node training examples, see :doc:`pytorch-training` (without Primus).
|
||||
For best performance on MI325X, MI350X, and MI355X GPUs, you might need to
|
||||
tweak some configurations (such as batch sizes).
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set unified_docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between the following two workflows to start benchmarking training.
|
||||
For fine-tuning workloads and multi-node training examples, see :doc:`pytorch-training` (without Primus).
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
@@ -158,7 +138,7 @@ tweak some configurations (such as batch sizes).
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-pytorch-model-support-v258` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
@@ -185,17 +165,10 @@ tweak some configurations (such as batch sizes).
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
.. note::
|
||||
|
||||
Currently, Primus torchtitan models are run with Primus Turbo
|
||||
enabled for enhanced performance. To disable Primus Turbo,
|
||||
modify respective configuration file
|
||||
``scripts/primus/pytorch_train/primus_torchtitan_scripts/llama3_[8B|70B]-[BF16|FP8].yaml``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Primus benchmarking
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
@@ -203,48 +176,34 @@ tweak some configurations (such as batch sizes).
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-pytorch-model-support-v258` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ unified_docker.pull_tag }}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
@@ -253,250 +212,17 @@ tweak some configurations (such as batch sizes).
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
|
||||
repository and navigate to the benchmark scripts directory
|
||||
``/workspace/MAD/scripts/pytorch_train``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/pytorch_train
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
.. rubric:: Pretraining
|
||||
|
||||
To get started, navigate to the ``Primus`` directory in your container.
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd /workspace/Primus
|
||||
|
||||
Now, to start the pretraining benchmark, use the ``run_pretrain.sh`` script
|
||||
included with Primus with the appropriate options.
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-8b
|
||||
|
||||
Use the following command to run train Llama 3.1 8B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 4
|
||||
|
||||
|
||||
To train Llama 3.1 8B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 8
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 7
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-70b
|
||||
|
||||
Use the following command to run train Llama 3.1 70B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 8
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 4
|
||||
|
||||
To train Llama 3.1 70B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 3
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Standalone torchtitan benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
3. Navigate to the ``torchtitan`` workspace directory.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/torchtitan
|
||||
|
||||
.. rubric:: Download the tokenizer
|
||||
|
||||
1. The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
@@ -505,47 +231,62 @@ tweak some configurations (such as batch sizes).
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
2. Download the tokenizer for your model.
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python3 scripts/download_tokenizer.py \
|
||||
--repo_id {{ model.model_repo }} \
|
||||
--tokenizer_path "original" \
|
||||
--hf_token=${HF_TOKEN}
|
||||
|
||||
.. rubric:: Pretraining examples
|
||||
|
||||
Run the training script with the appropriate configuration file.
|
||||
|
||||
For train with BF16 precicion, use the following command:
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
2. Run the setup script to install libraries and datasets needed for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CONFIG_FILE={{ model.config_file.bf16 }} \
|
||||
.run_train.sh
|
||||
./pytorch_benchmark_setup.sh
|
||||
|
||||
For train with BF16 precicion, use the following command:
|
||||
.. rubric:: Pretraining
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
To start the pretraining benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
.. code-block:: shell
|
||||
|
||||
CONFIG_FILE={{ model.config_file.fp8 }} \
|
||||
.run_train.sh
|
||||
./pytorch_benchmark_report.sh -t pretrain \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if model.mad_tag == "primus_pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||
- Currently, only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Sequence length for the language model.
|
||||
- Between 2048 and 8192. 8192 by default.
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
Use the following command to run train {{ model.model }} with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -m {{ model.model_repo }}
|
||||
|
||||
To train {{ model.model }} with FP8 precision, use the following command.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -m {{ model.model_repo }} -p FP8
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
|
||||
@@ -10,54 +10,44 @@ Training a model with PyTorch on ROCm
|
||||
|
||||
.. note::
|
||||
|
||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including torchtitan and :doc:`Megatron-LM <primus-megatron>`.
|
||||
|
||||
Primus with the PyTorch torchtitan backend is designed to replace :doc:`ROCm PyTorch training <pytorch-training>` workflow.
|
||||
See :doc:`primus-pytorch` for details.
|
||||
|
||||
PyTorch is an open-source machine learning framework that is widely used for
|
||||
model training with GPU-optimized components for transformer-based models.
|
||||
The PyTorch for ROCm training Docker image provides a prebuilt optimized
|
||||
environment for fine-tuning and pretraining a model on AMD Instinct MI325X
|
||||
and MI300X GPUs. It includes the following software components to accelerate
|
||||
training workloads:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
{% set docker = dockers[0] %}
|
||||
The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__
|
||||
(``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
|
||||
model on AMD Instinct MI325X and MI300X GPUs. It includes the following software components to accelerate
|
||||
training workloads:
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-pytorch-training-model-support-v259:
|
||||
.. _amd-pytorch-training-model-support:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct
|
||||
MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and
|
||||
training recommendations in this documentation might vary by model -- select
|
||||
one to get started.
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs.
|
||||
Some instructions, commands, and training recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
{% set unified_docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
@@ -88,13 +78,11 @@ one to get started.
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _amd-pytorch-training-supported-training-modes-v259:
|
||||
|
||||
The following table lists supported training modes per model.
|
||||
.. _amd-pytorch-training-supported-training-modes:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
The following table lists supported training modes per model.
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. dropdown:: Supported training modes
|
||||
|
||||
.. list-table::
|
||||
@@ -123,7 +111,7 @@ The following table lists supported training modes per model.
|
||||
unlisted fine-tuning methods by using an existing file in the
|
||||
``/workspace/torchtune/recipes/configs`` directory as a template.
|
||||
|
||||
.. _amd-pytorch-training-performance-measurements-v259:
|
||||
.. _amd-pytorch-training-performance-measurements:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
@@ -164,7 +152,7 @@ Run training
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set unified_docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between two options to start benchmarking training:
|
||||
@@ -179,7 +167,7 @@ Run training
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-pytorch-training-model-support` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
@@ -217,7 +205,7 @@ Run training
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-pytorch-training-model-support` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
@@ -226,42 +214,28 @@ Run training
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
docker pull {{ unified_docker.pull_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
2. Run the Docker container.
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
.. code-block:: shell
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ unified_docker.pull_tag }}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
@@ -405,7 +379,7 @@ Run training
|
||||
|
||||
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
|
||||
|
||||
* `frank-chieng/chinese_architecture_siheyuan <https://huggingface.co/datasets/frank-chieng/chinese_architecture_siheyuan>`__
|
||||
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`__
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
@@ -436,7 +410,7 @@ Run training
|
||||
|
||||
.. note::
|
||||
|
||||
Currently, FLUX models are not supported out-of-the-box on this Docker.
|
||||
Currently, FLUX models are not supported out-of-the-box on {{ unified_docker.pull_tag }}.
|
||||
To use FLUX, refer to ``rocm/pytorch-training`` Docker: :doc:`previous-versions/pytorch-training-v25.6`
|
||||
|
||||
Occasionally, downloading the Flux dataset might fail. In the event of this
|
||||
@@ -468,49 +442,6 @@ Run training
|
||||
- Between 2048 and 8192. 8192 by default.
|
||||
{% endif %}
|
||||
|
||||
{% set training_modes = model.training_modes %}
|
||||
{% set training_mode_descs = {
|
||||
"posttrain": "Benchmark post-training.",
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["posttrain"]) | list %}
|
||||
{% if available_modes %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Post-training
|
||||
|
||||
To start the post-training benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Sequence length for the language model.
|
||||
- Between 2048 and 8192. 8192 by default.
|
||||
{% endif %}
|
||||
|
||||
{% set training_mode_descs = {
|
||||
"finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
|
||||
"finetune_lora": "LoRA fine-tuning (BF16 supported).",
|
||||
@@ -525,7 +456,7 @@ Run training
|
||||
|
||||
To start the fine-tuning benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v259>`.
|
||||
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes>`.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
@@ -590,7 +521,7 @@ Run training
|
||||
|
||||
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
|
||||
|
||||
.. _amd-pytorch-training-multinode-examples-v259:
|
||||
.. _amd-pytorch-training-multinode-examples:
|
||||
|
||||
Multi-node training
|
||||
-------------------
|
||||
@@ -639,11 +570,6 @@ To launch the training job on a SLURM cluster for Llama 3.3 70B, run the followi
|
||||
|
||||
Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ In DDP training, each process or worker owns a replica of the model and processe
|
||||
|
||||
See the following developer blogs for more in-depth explanations and examples.
|
||||
|
||||
* `Multi GPU training with DDP — PyTorch Tutorials <https://docs.pytorch.org/tutorials/beginner/ddp_series_multigpu.html>`__
|
||||
* `Multi GPU training with DDP — PyTorch Tutorials <https://pytorch.org/tutorials/beginner/ddp_series_multigpu.html>`_
|
||||
|
||||
* `Building a decoder transformer model on AMD GPUs — ROCm Blogs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/decoder-transformer/README.html#distributed-training-on-multiple-gpus>`_
|
||||
|
||||
@@ -93,7 +93,7 @@ The following table shows whether a ROCm library is graph-safe.
|
||||
- ⚠️ (experimental)
|
||||
*
|
||||
- `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- ❌ (see :doc:`details <rocthrust:reference/rocThrust-hipgraph-support>`)
|
||||
- ❌ (see :doc:`details <rocthrust:hipgraph-support>`)
|
||||
*
|
||||
- `rocWMMA <https://github.com/ROCm/rocWMMA>`_
|
||||
- ❌
|
||||
|
||||
@@ -43,6 +43,8 @@ subtrees:
|
||||
title: DGL compatibility
|
||||
- file: compatibility/ml-compatibility/megablocks-compatibility.rst
|
||||
title: Megablocks compatibility
|
||||
- file: compatibility/ml-compatibility/taichi-compatibility.rst
|
||||
title: Taichi compatibility
|
||||
- file: compatibility/ml-compatibility/ray-compatibility.rst
|
||||
title: Ray compatibility
|
||||
- file: compatibility/ml-compatibility/llama-cpp-compatibility.rst
|
||||
@@ -132,8 +134,6 @@ subtrees:
|
||||
title: Profile and debug
|
||||
- file: how-to/rocm-for-ai/inference-optimization/workload.rst
|
||||
title: Workload optimization
|
||||
- file: how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
|
||||
title: vLLM V1 performance optimization
|
||||
|
||||
- url: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/
|
||||
title: AI tutorials
|
||||
|
||||
Reference in New Issue
Block a user