Docs: Ray release 25.12 and compatibility version format standardization (#5845)

Bump urllib3 from 2.5.0 to 2.6.3 in /docs/sphinx (#5842)
Bumps [urllib3](https://github.com/urllib3/urllib3) from 2.5.0 to 2.6.3. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/2.5.0...2.6.3) --- updated-dependencies: - dependency-name: urllib3 dependency-version: 2.6.3 dependency-type: indirect ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-01-09 22:58:17 -05:00 · 2026-01-08 12:09:11 -05:00 · 2026-01-08 08:22:01 -05:00 · 2026-01-07 13:49:31 -05:00 · 2026-01-07 11:00:38 -05:00 · 2026-01-06 14:10:42 -05:00
54 changed files with 6858 additions and 1001 deletions
--- a/.azuredevops/components/HIP.yml
+++ b/.azuredevops/components/HIP.yml
@@ -34,6 +34,7 @@ parameters:
  default:
    - cmake
    - libnuma-dev
+    - libsimde-dev
    - mesa-common-dev
    - ninja-build
    - ocl-icd-libopencl1
--- a/.azuredevops/components/origami.yml
+++ b/.azuredevops/components/origami.yml
@@ -39,6 +39,7 @@ parameters:
    - python3
    - python3-dev
    - python3-pip
+    - python3-venv
    - libgtest-dev
    - libboost-filesystem-dev
    - libboost-program-options-dev
@@ -46,6 +47,8 @@ parameters:
  type: object
  default:
    - nanobind>=2.0.0
+    - pytest
+    - pytest-cov
 - name: rocmDependencies
  type: object
  default:
@@ -72,8 +75,10 @@ parameters:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
-      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+      # - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
+      # - { os: ubuntu2204, packageManager: apt, target: gfx1151 }
+      # - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
 - name: downstreamComponentMatrix
  type: object
  default:
@@ -116,6 +121,11 @@ jobs:
      parameters:
        dependencyList:
          - gtest
+    - ${{ if ne(job.os, 'almalinux8') }}:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+        parameters:
+          dependencyList:
+            - catch2
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -137,6 +147,7 @@ jobs:
          -DORIGAMI_BUILD_SHARED_LIBS=ON
          -DORIGAMI_ENABLE_PYTHON=ON
          -DORIGAMI_BUILD_TESTING=ON
+          -DORIGAMI_ENABLE_FETCH=ON
          -GNinja
    - ${{ if ne(job.os, 'almalinux8') }}:
      - task: PublishPipelineArtifact@1
@@ -169,7 +180,6 @@ jobs:
      dependsOn: origami_build_${{ job.os }}
      condition:
        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
          eq(${{ parameters.aggregatePipeline }}, False)
        )
@@ -180,30 +190,30 @@ jobs:
      workspace:
        clean: all
      steps:
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-        parameters:
-          checkoutRepo: ${{ parameters.checkoutRepo }}
-          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          pipModules: ${{ parameters.pipModules }}
          packageManager: ${{ job.packageManager }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+        parameters:
+          dependencyList:
+            - gtest
+      - ${{ if ne(job.os, 'almalinux8') }}:
+        - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+          parameters:
+            dependencyList:
+              - catch2
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
        parameters:
          preTargetFilter: ${{ parameters.componentName }}
          os: ${{ job.os }}
-      - task: DownloadPipelineArtifact@2
-        displayName: 'Download Build Directory Artifact'
-        inputs:
-          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
-          path: '$(Agent.BuildDirectory)/s/build'
-      - task: DownloadPipelineArtifact@2
-        displayName: 'Download Python Source Artifact'
-        inputs:
-          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
-          path: '$(Agent.BuildDirectory)/s/python'
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
        parameters:
          checkoutRef: ${{ parameters.checkoutRef }}
@@ -212,25 +222,72 @@ jobs:
          gpuTarget: ${{ job.target }}
          ${{ if parameters.triggerDownstreamJobs }}:
            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - task: CMake@1
+        displayName: 'Origami Test CMake Configuration'
+        inputs:
+          cmakeArgs: >-
+            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
+            -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+            -DORIGAMI_BUILD_SHARED_LIBS=ON
+            -DORIGAMI_ENABLE_PYTHON=ON
+            -DORIGAMI_BUILD_TESTING=ON
+            -GNinja
+            $(Agent.BuildDirectory)/s
+      - task: Bash@3
+        displayName: 'Build Origami Tests and Python Bindings'
+        inputs:
+          targetType: inline
+          workingDirectory: build
+          script: |
+            cmake --build . --target origami-tests origami_python -- -j$(nproc)
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      # Run tests using CTest (discovers and runs both C++ and Python tests)
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
          os: ${{ job.os }}
-          testDir: '$(Agent.BuildDirectory)/rocm/bin'
-          testExecutable: './origami-tests'
-          testParameters: '--yaml origami-tests.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
-      - script: |
-          set -e
-          export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH
-
-          echo "--- Running origami_test.py ---"
-          python3 $(Agent.BuildDirectory)/s/python/origami_test.py
-          
-          echo "--- Running origami_grid_test.py ---"
-          python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py
-        displayName: 'Run Python Binding Tests'
-        condition: succeeded()
+          testDir: 'build'
+          testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml'
+      # Test pip install workflow
+      # - task: Bash@3
+      #   displayName: 'Test Pip Install'
+      #   inputs:
+      #     targetType: inline
+      #     script: |
+      #       set -e
+            
+      #       echo "==================================================================="
+      #       echo "Testing pip install workflow (pip install -e .)"
+      #       echo "==================================================================="
+            
+      #       # Set environment variables for pip install CMake build
+      #       export ROCM_PATH=$(Agent.BuildDirectory)/rocm
+      #       export CMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm:$(Agent.BuildDirectory)/vendor
+      #       export CMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+            
+      #       echo "ROCM_PATH: $ROCM_PATH"
+      #       echo "CMAKE_PREFIX_PATH: $CMAKE_PREFIX_PATH"
+      #       echo "CMAKE_CXX_COMPILER: $CMAKE_CXX_COMPILER"
+      #       echo ""
+            
+      #       # Install from source directory
+      #       cd "$(Agent.BuildDirectory)/s/python"
+      #       pip install -e .
+            
+      #       # Verify import works
+      #       echo ""
+      #       echo "Verifying origami can be imported..."
+      #       python3 -c "import origami; print('✓ Successfully imported origami')"
+            
+      #       # Run pytest on installed package
+      #       echo ""
+      #       echo "Running pytest tests..."
+      #       python3 -m pytest tests/ -v -m "not slow" --tb=short
+            
+      #       echo ""
+      #       echo "==================================================================="
+      #       echo "Pip install test completed successfully"
+      #       echo "==================================================================="
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -261,6 +261,7 @@ Ioffe
 JAX's
 JAXLIB
 Jinja
+js
 JSON
 Jupyter
 KFD
@@ -525,6 +526,7 @@ TensileLite
 TensorBoard
 TensorFlow
 TensorParallel
+TheRock
 ToC
 TorchAudio
 torchaudio
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -685,7 +685,7 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
 * `Compute Throughput` panel to TUI's `High Level Analysis` category with the following metrics: VALU FLOPs, VALU IOPs, MFMA FLOPs (F8), MFMA FLOPs (BF16), MFMA FLOPs (F16), MFMA FLOPs (F32), MFMA FLOPs (F64), MFMA FLOPs (F6F4) (in gfx950), MFMA IOPs (Int8), SALU Utilization, VALU Utilization, MFMA Utilization, VMEM Utilization, Branch Utilization, IPC

 * `Memory Throughput` panel to TUI's `High Level Analysis` category with the following metrics: vL1D Cache BW, vL1D Cache Utilization, Theoretical LDS Bandwidth, LDS Utilization, L2 Cache BW, L2 Cache Utilization, L2-Fabric Read BW, L2-Fabric Write BW, sL1D Cache BW, L1I BW, Address Processing Unit Busy, Data-Return Busy, L1I-L2 Bandwidth, sL1D-L2 BW
-* Roofline support for Debian 12 and Azure Linux 3.0.
+* Roofline support for Debian 12.
 * Notice for change in default output format to `rocpd` in a future release
  * This is displayed when `--format-rocprof-output rocpd` is not used in profile mode

--- a/docs/compatibility/compatibility-matrix-historical-6.0.csv
+++ b/docs/compatibility/compatibility-matrix-historical-6.0.csv
@@ -8,7 +8,7 @@ ROCm Version,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6
      ,,,,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
      ,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8",Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,,,
      ,"Debian 13, 12","Debian 13, 12","Debian 13, 12",Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,,,,,,,,,,,
-      ,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,,,,,,,,,,,,
+      ,,,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,,,,,,,,,,,,
      ,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,,,,,,,,,,,,,,,,,,
      ,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,CDNA4,CDNA4,,,,,,,,,,,,,,,,,,
@@ -37,7 +37,7 @@ ROCm Version,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6
      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,2.4.0,2.4.0,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,2.51.1,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
      :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,N/A,N/A,b6652,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
      :doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.23.1,1.22.0,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
@@ -80,12 +80,12 @@ ROCm Version,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6
      :doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.1.0,4.0.1,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.5,0.2.4,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
      :doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.1,4.0.0,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
-      :doc:`rocBLAS <rocblas:index>`,5.1.0,5.1.0,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
+      :doc:`rocBLAS <rocblas:index>`,5.1.1,5.1.0,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
      :doc:`rocFFT <rocfft:index>`,1.0.35,1.0.35,1.0.34,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
      :doc:`rocRAND <rocrand:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
      :doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.31.0,3.30.1,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
      :doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.1.0,4.0.2,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
-      :doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
+      :doc:`rocWMMA <rocwmma:index>`,2.1.0,2.0.0,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
      :doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.44.0,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
      ,,,,,,,,,,,,,,,,,,,,,,
      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
@@ -96,20 +96,20 @@ ROCm Version,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6
      ,,,,,,,,,,,,,,,,,,,,,,
      SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,,,
      `hipother <https://github.com/ROCm/hipother>`_,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      `rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.0,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
      ,,,,,,,,,,,,,,,,,,,,,,
      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
-      :doc:`AMD SMI <amdsmi:index>`,26.1.0,26.1.0,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
+      :doc:`AMD SMI <amdsmi:index>`,26.2.0,26.1.0,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
      :doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,1.1.0,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.8.0,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
-      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.3.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
      ,,,,,,,,,,,,,,,,,,,,,,
      PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,,,,
      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,2.6.0,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
-      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.0,3.3.0,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.0,1.2.0,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.1,3.3.0,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.1,1.2.0,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
      :doc:`ROCProfiler <rocprofiler:index>`,2.0.70101,2.0.70100,2.0.70002,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,1.0.0,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
      :doc:`ROCTracer <roctracer:index>`,4.1.70101,4.1.70100,4.1.70002,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -32,7 +32,7 @@ compatibility and system requirements.
      ,SLES 15 SP7,SLES 15 SP7,SLES 15 SP6
      ,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8"
      ,"Debian 13, 12","Debian 13, 12",Debian 12
-      ,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0
+      ,,,Azure Linux 3.0
      ,Rocky Linux 9,Rocky Linux 9,
      ,.. _architecture-support-compatibility-matrix:,,
      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,
@@ -98,12 +98,12 @@ compatibility and system requirements.
      :doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.1.0,3.2.0
      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.5,0.2.3
      :doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.1,3.2.2
-      :doc:`rocBLAS <rocblas:index>`,5.1.0,5.1.0,4.4.0
+      :doc:`rocBLAS <rocblas:index>`,5.1.1,5.1.0,4.4.0
      :doc:`rocFFT <rocfft:index>`,1.0.35,1.0.35,1.0.32
      :doc:`rocRAND <rocrand:index>`,4.1.0,4.1.0,3.3.0
      :doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.31.0,3.28.0
      :doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.1.0,3.4.0
-      :doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,1.7.0
+      :doc:`rocWMMA <rocwmma:index>`,2.1.0,2.0.0,1.7.0
      :doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.43.0
      ,,,
      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
@@ -114,20 +114,20 @@ compatibility and system requirements.
      ,,,
      SUPPORT LIBS,,,
      `hipother <https://github.com/ROCm/hipother>`_,7.1.52802,7.1.25424,6.4.43482
-      `rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.0,7.1.0,6.4.0
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.1,7.1.0,6.4.0
      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
      ,,,
      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
-      :doc:`AMD SMI <amdsmi:index>`,26.1.0,26.1.0,25.3.0
+      :doc:`AMD SMI <amdsmi:index>`,26.2.0,26.1.0,25.3.0
      :doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,0.3.0
      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.5.0
-      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.1.0
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.3.0,1.2.0,1.1.0
      ,,,
      PERFORMANCE TOOLS,,,
      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,1.4.0
-      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.0,3.3.0,3.1.0
-      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.0,1.2.0,1.0.0
+      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.1,3.3.0,3.1.0
+      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.1,1.2.0,1.0.0
      :doc:`ROCProfiler <rocprofiler:index>`,2.0.70101,2.0.70100,2.0.60400
      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,0.6.0
      :doc:`ROCTracer <roctracer:index>`,4.1.70101,4.1.70100,4.1.60400
@@ -157,8 +157,8 @@ compatibility and system requirements.

 .. [#os-compatibility] Some operating systems are supported on limited GPUs. For detailed information, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
 .. [#gpu-compatibility] Some GPUs have limited operating system support. For detailed information, see the latest :ref:`supported_GPUs`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-gpus>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-gpus>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-gpus>`__.
-.. [#dgl_compat] DGL is supported only on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
-.. [#llama-cpp_compat] llama.cpp is supported only on ROCm 7.0.0 and ROCm 6.4.x.
+.. [#dgl_compat] DGL is only supported on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
+.. [#llama-cpp_compat] llama.cpp is only supported on ROCm 7.0.0 and ROCm 6.4.x.
 .. [#mi325x_KVM] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
 .. [#driver_patch] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
 .. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
@@ -169,44 +169,7 @@ compatibility and system requirements.
 Operating systems, kernel and Glibc versions
 *********************************************

-Use this lookup table to confirm which operating system and kernel versions are supported with ROCm.
-
-.. csv-table::
-   :header: "OS", "Version", "Kernel", "Glibc"
-   :widths: 40, 20, 30, 20
-   :stub-columns: 1
-
-   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.3, "6.8 [GA], 6.14 [HWE]", 2.39
-   ,,
-   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.2, "6.8 [GA], 6.11 [HWE]", 2.39
-   ,,
-   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 [GA], 6.8 [HWE]", 2.35
-   ,,
-   `Red Hat Enterprise Linux (RHEL 10) <https://access.redhat.com/articles/3078#RHEL9>`_, 10.1, 6.12.0-124, 2.39
-   ,10.0, 6.12.0-55, 2.39
-   ,,
-   `Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.7, 5.14.0-611, 2.34
-   ,9.6, 5.14.0-570, 2.34
-   ,9.5, 5.14+, 2.34
-   ,9.4, 5.14.0-427, 2.34
-   ,,
-   `Red Hat Enterprise Linux (RHEL 8) <https://access.redhat.com/articles/3078#RHEL8>`_, 8.10, 4.18.0-553, 2.28
-   ,,
-   `SUSE Linux Enterprise Server (SLES) <https://www.suse.com/support/kb/doc/?id=000019587#SLE15SP4>`_, 15 SP7, 6.40-150700.51, 2.38
-   ,15 SP6, "6.5.0+, 6.4.0", 2.38
-   ,15 SP5, 5.14.21, 2.31
-   ,,
-   `Rocky Linux <https://wiki.rockylinux.org/rocky/version/>`_, 9, 5.14.0-570, 2.34
-   ,,
-   `Oracle Linux <https://blogs.oracle.com/scoter/post/oracle-linux-and-unbreakable-enterprise-kernel-uek-releases>`_, 10, 6.12.0 (UEK), 2.39
-   ,9, 6.12.0 (UEK), 2.34
-   ,8, 5.15.0 (UEK), 2.28
-   ,,
-   `Debian <https://www.debian.org/download>`_,13, 6.12, 2.35
-   ,12, 6.1.0, 2.36
-   ,,
-   `Azure Linux <https://techcommunity.microsoft.com/blog/linuxandopensourceblog/azure-linux-3-0-now-in-preview-on-azure-kubernetes-service-v1-31/4287229>`_,3.0, 6.6.92, 2.38
-   ,,
+For detailed information on the operating systems supported on ROCm 7.1.1 and the associated kernel and Glibc versions, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`__ and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.

 .. note::

@@ -238,16 +201,16 @@ Expand for full historical view of:

   .. rubric:: Footnotes

-   .. [#os-compatibility-past-60] Some operating systems are supported on limited GPUs. For detailed information, see :ref:`supported_distributions` and select the required ROCm version for version specific support.
-   .. [#gpu-compatibility-past-60] Some GPUs have limited operating system support. For detailed information, see :ref:`supported_GPUs` and select the required ROCm version for version specific support.
+   .. [#os-compatibility-past-60] Some operating systems are supported on limited GPUs. For detailed information, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
+   .. [#gpu-compatibility-past-60] Some GPUs have limited operating system support. For detailed information, see the latest :ref:`supported_GPUs`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-gpus>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-gpus>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-gpus>`__.
   .. [#tf-mi350-past-60] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
-   .. [#verl_compat-past-60] verl is supported only on ROCm 7.0.0 and 6.2.0.
-   .. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is supported only on ROCm 6.3.0.
-   .. [#dgl_compat-past-60] DGL is supported only on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
-   .. [#megablocks_compat-past-60] Megablocks is supported only on ROCm 6.3.0.
-   .. [#ray_compat-past-60] Ray is supported only on ROCm 6.4.1.
-   .. [#llama-cpp_compat-past-60] llama.cpp is supported only on ROCm 7.0.0 and 6.4.x.
-   .. [#flashinfer_compat-past-60] FlashInfer is supported only on ROCm 6.4.1.
+   .. [#verl_compat-past-60] verl is only supported on ROCm 7.0.0 and 6.2.0.
+   .. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is only supported on ROCm 6.3.0.
+   .. [#dgl_compat-past-60] DGL is only supported on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
+   .. [#megablocks_compat-past-60] Megablocks is only supported on ROCm 6.3.0.
+   .. [#ray_compat-past-60] Ray is only supported on ROCm 7.0.0 and 6.4.1.
+   .. [#llama-cpp_compat-past-60] llama.cpp is only supported on ROCm 7.0.0 and 6.4.x.
+   .. [#flashinfer_compat-past-60] FlashInfer is only supported on ROCm 6.4.1.
   .. [#mi325x_KVM-past-60] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
   .. [#driver_patch-past-60] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
   .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
--- a/docs/compatibility/ml-compatibility/dgl-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/dgl-compatibility.rst
@@ -36,63 +36,9 @@ Support overview
  - You can also consult the upstream `Installation guide <https://www.dgl.ai/pages/start.html>`__ 
    for additional context.

-Version support
--------------------------------------------------------------------------------
-
-DGL is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__, 
-`ROCm 6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__, and `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
-
-Supported devices
--------------------------------------------------------------------------------
-
-**Officially Supported**: AMD Instinct™ MI300X, MI250X
-
-.. _dgl-recommendations:
-
-Use cases and recommendations
-================================================================================
-
-DGL can be used for Graph Learning, and building popular graph models like  
-GAT, GCN, and GraphSage. Using these models, a variety of use cases are supported:
-
- Recommender systems
- Network Optimization and Analysis
- 1D (Temporal) and 2D (Image) Classification
- Drug Discovery
-
-For use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
-where you can search for DGL examples and best practices to optimize your workloads on AMD GPUs.
-
-* Although multiple use cases of DGL have been tested and verified, a few have been  
-  outlined in the `DGL in the Real World: Running GNNs on Real Use Cases 
-  <https://rocm.blogs.amd.com/artificial-intelligence/dgl_blog2/README.html>`__ blog 
-  post, which walks through four real-world graph neural network (GNN) workloads 
-  implemented with the Deep Graph Library on ROCm. It covers tasks ranging from 
-  heterogeneous e-commerce graphs and multiplex networks (GATNE) to molecular graph 
-  regression (GNN-FiLM) and EEG-based neurological diagnosis (EEG-GCNN). For each use 
-  case, the authors detail: the dataset and task, how DGL is used, and their experience 
-  porting to ROCm. It is shown that DGL codebases often run without modification, with 
-  seamless integration of graph operations, message passing, sampling, and convolution. 
-
-* The `Graph Neural Networks (GNNs) at Scale: DGL with ROCm on AMD Hardware 
-  <https://rocm.blogs.amd.com/artificial-intelligence/why-graph-neural/README.html>`__ 
-  blog post introduces the Deep Graph Library (DGL) and its enablement on the AMD ROCm platform, 
-  bringing high-performance graph neural network (GNN) training to AMD GPUs. DGL bridges 
-  the gap between dense tensor frameworks and the irregular nature of graph data through a 
-  graph-first, message-passing abstraction. Its design ensures scalability, flexibility, and 
-  interoperability across frameworks like PyTorch and TensorFlow. AMD’s ROCm integration 
-  enables DGL to run efficiently on HIP-based GPUs, supported by prebuilt Docker containers 
-  and open-source repositories. This marks a major step in AMD's mission to advance open, 
-  scalable AI ecosystems beyond traditional architectures.
-
-You can pre-process datasets and begin training on AMD GPUs through:
-
-* Single-GPU training/inference
-* Multi-GPU training
-
 .. _dgl-docker-compat:

-Docker image compatibility
+Compatibility matrix
 ================================================================================

 .. |docker-icon| raw:: html
@@ -114,6 +60,7 @@ Click the |docker-icon| to view the image on Docker Hub.
      - PyTorch
      - Ubuntu
      - Python
+      - GPU

    * - .. raw:: html

@@ -124,6 +71,7 @@ Click the |docker-icon| to view the image on Docker Hub.
      - `2.8.0 <https://github.com/pytorch/pytorch/releases/tag/v2.8.0>`__
      - 24.04
      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
+      - MI300X, MI250X

    * - .. raw:: html

@@ -134,6 +82,7 @@ Click the |docker-icon| to view the image on Docker Hub.
      - `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
      - 24.04
      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
+      - MI300X, MI250X

    * - .. raw:: html

@@ -144,6 +93,7 @@ Click the |docker-icon| to view the image on Docker Hub.
      - `2.7.1 <https://github.com/pytorch/pytorch/releases/tag/v2.7.1>`__
      - 22.04
      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
+      - MI300X, MI250X

    * - .. raw:: html

@@ -154,6 +104,7 @@ Click the |docker-icon| to view the image on Docker Hub.
      - `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
      - 24.04
      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
+      - MI300X, MI250X

    * - .. raw:: html

@@ -164,6 +115,7 @@ Click the |docker-icon| to view the image on Docker Hub.
      - `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
      - 24.04
      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
+      - MI300X, MI250X

    * - .. raw:: html

@@ -174,7 +126,7 @@ Click the |docker-icon| to view the image on Docker Hub.
      - `2.4.1 <https://github.com/pytorch/pytorch/releases/tag/v2.4.1>`__
      - 24.04
      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
-
+      - MI300X, MI250X

    * - .. raw:: html

@@ -185,7 +137,7 @@ Click the |docker-icon| to view the image on Docker Hub.
      - `2.4.1 <https://github.com/pytorch/pytorch/releases/tag/v2.4.1>`__
      - 22.04
      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
-
+      - MI300X, MI250X

    * - .. raw:: html

@@ -196,7 +148,10 @@ Click the |docker-icon| to view the image on Docker Hub.
      - `2.3.0 <https://github.com/pytorch/pytorch/releases/tag/v2.3.0>`__
      - 22.04
      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
-      
+      - MI300X, MI250X
+
+
+.. _dgl-key-rocm-libraries:

 Key ROCm libraries for DGL
 ================================================================================
@@ -310,8 +265,9 @@ If you prefer to build it yourself, ensure the following dependencies are instal
        multiplication (GEMM) and accumulation operations with mixed precision
        support.

+.. _dgl-supported-features-latest:

-Supported features
+Supported features with ROCm 7.0.0
 ================================================================================

 Many functions and methods available upstream are also supported in DGL on ROCm.
@@ -335,14 +291,17 @@ Instead of listing them all, support is grouped into the following categories to
 * DGL Sparse
 * GraphBolt

-Unsupported features
+.. _dgl-unsupported-features-latest:
+
+Unsupported features with ROCm 7.0.0
 ================================================================================

 * TF32 Support (only supported for PyTorch 2.7 and above)
 * Kineto/ROCTracer integration

+.. _dgl-unsupported-functions:

-Unsupported functions
+Unsupported functions with ROCm 7.0.0
 ================================================================================

 * ``bfs``
@@ -355,6 +314,50 @@ Unsupported functions
 * ``sample_labors_noprob``
 * ``sparse_admin``

+.. _dgl-recommendations:
+
+Use cases and recommendations
+================================================================================
+
+DGL can be used for graph learning and for building popular graph models such as
+GAT, GCN, and GraphSAGE. Using these models, a variety of use cases are supported:
+
+- Recommender systems
+- Network Optimization and Analysis
+- 1D (Temporal) and 2D (Image) Classification
+- Drug Discovery
+
+For use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
+where you can search for DGL examples and best practices to optimize your workloads on AMD GPUs.
+
+* Although multiple use cases of DGL have been tested and verified, a few have been  
+  outlined in the `DGL in the Real World: Running GNNs on Real Use Cases 
+  <https://rocm.blogs.amd.com/artificial-intelligence/dgl_blog2/README.html>`__ blog 
+  post, which walks through four real-world graph neural network (GNN) workloads 
+  implemented with the Deep Graph Library on ROCm. It covers tasks ranging from 
+  heterogeneous e-commerce graphs and multiplex networks (GATNE) to molecular graph 
+  regression (GNN-FiLM) and EEG-based neurological diagnosis (EEG-GCNN). For each use 
+  case, the authors detail: the dataset and task, how DGL is used, and their experience 
+  porting to ROCm. It is shown that DGL codebases often run without modification, with 
+  seamless integration of graph operations, message passing, sampling, and convolution. 
+
+* The `Graph Neural Networks (GNNs) at Scale: DGL with ROCm on AMD Hardware 
+  <https://rocm.blogs.amd.com/artificial-intelligence/why-graph-neural/README.html>`__ 
+  blog post introduces the Deep Graph Library (DGL) and its enablement on the AMD ROCm platform, 
+  bringing high-performance graph neural network (GNN) training to AMD GPUs. DGL bridges 
+  the gap between dense tensor frameworks and the irregular nature of graph data through a 
+  graph-first, message-passing abstraction. Its design ensures scalability, flexibility, and 
+  interoperability across frameworks like PyTorch and TensorFlow. AMD's ROCm integration
+  enables DGL to run efficiently on HIP-based GPUs, supported by prebuilt Docker containers 
+  and open-source repositories. This marks a major step in AMD's mission to advance open, 
+  scalable AI ecosystems beyond traditional architectures.
+
+You can pre-process datasets and begin training on AMD GPUs through:
+
+* Single-GPU training/inference
+* Multi-GPU training
+
+
 Previous versions
 ===============================================================================
 See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/dgl-history` to find documentation for previous releases
--- a/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst
@@ -42,38 +42,9 @@ Support overview
  - You can also consult the upstream `Installation guide <https://docs.flashinfer.ai/installation.html>`__ 
    for additional context.

-Version support
--------------------------------------------------------------------------------
-
-FlashInfer is supported on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
-
-Supported devices
--------------------------------------------------------------------------------
-
-**Officially Supported**: AMD Instinct™ MI300X
-
-
-.. _flashinfer-recommendations:
-
-Use cases and recommendations
-================================================================================
-
-This release of FlashInfer on ROCm provides the decode functionality for LLM inferencing.
-In the decode phase, tokens are generated sequentially, with the model predicting each new 
-token based on the previously generated tokens and the input context.
-
-FlashInfer on ROCm brings over upstream features such as load balancing, sparse and dense 
-attention optimizations, and batching support, enabling efficient execution on AMD Instinct™ MI300X GPUs.
-
-Because large LLMs often require substantial KV caches or long context windows, FlashInfer on ROCm 
-also implements cascade attention from upstream to reduce memory usage. 
-
-For currently supported use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
-where you can search for examples and best practices to optimize your workloads on AMD GPUs.
-
 .. _flashinfer-docker-compat:

-Docker image compatibility
+Compatibility matrix
 ================================================================================

 .. |docker-icon| raw:: html
@@ -95,6 +66,7 @@ Click |docker-icon| to view the image on Docker Hub.
      - PyTorch
      - Ubuntu
      - Python
+      - GPU

    * - .. raw:: html

@@ -104,5 +76,23 @@ Click |docker-icon| to view the image on Docker Hub.
      - `2.7.1 <https://github.com/ROCm/pytorch/releases/tag/v2.7.1>`__
      - 24.04
      - `3.12 <https://www.python.org/downloads/release/python-3129/>`__
+      - MI300X

+.. _flashinfer-recommendations:
+
+Use cases and recommendations
+================================================================================
+
+The release of FlashInfer on ROCm provides the decode functionality for LLM inferencing.
+In the decode phase, tokens are generated sequentially, with the model predicting each new 
+token based on the previously generated tokens and the input context.
+
+FlashInfer on ROCm brings over upstream features such as load balancing, sparse and dense 
+attention optimizations, and batching support, enabling efficient execution on AMD Instinct™ MI300X GPUs.
+
+Because large LLMs often require substantial KV caches or long context windows, FlashInfer on ROCm 
+also implements cascade attention from upstream to reduce memory usage. 
+
+For currently supported use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
+where you can search for examples and best practices to optimize your workloads on AMD GPUs.

--- a/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
@@ -36,47 +36,9 @@ Support overview
  - You can also consult the upstream `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md>`__ 
    for additional context.

-Version support
--------------------------------------------------------------------------------
-
-llama.cpp is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__ and 
-`ROCm 6.4.x <https://repo.radeon.com/rocm/apt/6.4/>`__.
-
-Supported devices
--------------------------------------------------------------------------------
-
-**Officially Supported**: AMD Instinct™ MI325X, MI300X, MI210
-
-Use cases and recommendations
-================================================================================
-
-llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:
-
- Plain C/C++ implementation with no external dependencies
- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)
-
-llama.cpp is also used in a range of real-world applications, including:
-
- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
-  A simple maze game where AI-controlled agents attempt to trick the player.
- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
-  A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
- Various other AI applications use llama.cpp as their inference engine;  
-  for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
-
-For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
-where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
-
- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__ 
-  blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``, 
-  server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for 
-  AMD Instinct GPUs within the ROCm ecosystem. 
-
 .. _llama-cpp-docker-compat:

-Docker image compatibility
+Compatibility matrix
 ================================================================================

 .. |docker-icon| raw:: html
@@ -106,6 +68,7 @@ Click |docker-icon| to view the image on Docker Hub.
      - llama.cpp
      - ROCm
      - Ubuntu
+      - GPU

    * - .. raw:: html

@@ -119,6 +82,7 @@ Click |docker-icon| to view the image on Docker Hub.
      - `b6652 <https://github.com/ROCm/llama.cpp/tree/release/b6652>`__
      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
      - 24.04
+      - MI325X, MI300X, MI210

    * - .. raw:: html

@@ -132,6 +96,7 @@ Click |docker-icon| to view the image on Docker Hub.
      - `b6652 <https://github.com/ROCm/llama.cpp/tree/release/b6652>`__
      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
      - 22.04
+      - MI325X, MI300X, MI210

    * - .. raw:: html

@@ -145,6 +110,7 @@ Click |docker-icon| to view the image on Docker Hub.
      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
      - `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
      - 24.04
+      - MI325X, MI300X, MI210

    * - .. raw:: html

@@ -158,7 +124,7 @@ Click |docker-icon| to view the image on Docker Hub.
      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
      - `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
      - 22.04
-
+      - MI325X, MI300X, MI210

    * - .. raw:: html

@@ -172,6 +138,7 @@ Click |docker-icon| to view the image on Docker Hub.
      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
      - `6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__
      - 24.04
+      - MI325X, MI300X, MI210

    * - .. raw:: html

@@ -185,7 +152,7 @@ Click |docker-icon| to view the image on Docker Hub.
      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
      - `6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__
      - 22.04
-
+      - MI325X, MI300X, MI210

    * - .. raw:: html

@@ -199,6 +166,7 @@ Click |docker-icon| to view the image on Docker Hub.
      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
      - 24.04
+      - MI325X, MI300X, MI210

    * - .. raw:: html

@@ -212,6 +180,7 @@ Click |docker-icon| to view the image on Docker Hub.
      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
      - 22.04
+      - MI325X, MI300X, MI210

    * - .. raw:: html

@@ -225,7 +194,9 @@ Click |docker-icon| to view the image on Docker Hub.
      - `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
      - 24.04
+      - MI300X, MI210

+.. _llama-cpp-key-rocm-libraries:

 Key ROCm libraries for llama.cpp
 ================================================================================
@@ -268,6 +239,36 @@ your corresponding ROCm version.
      - Can be used to enhance the flash attention performance on AMD compute, by enabling
        the flag during compile time.

+.. _llama-cpp-uses-recommendations:
+
+Use cases and recommendations
+================================================================================
+
+llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:
+
+- Plain C/C++ implementation with no external dependencies
+- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
+- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
+- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)
+
+llama.cpp is also used in a range of real-world applications, including:
+
+- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
+  A simple maze game where AI-controlled agents attempt to trick the player.
+- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
+  A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
+- Various other AI applications use llama.cpp as their inference engine;  
+  for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
+
+For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
+where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
+
+- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__ 
+  blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``, 
+  server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for 
+  AMD Instinct GPUs within the ROCm ecosystem. 
+
+
 Previous versions
 ===============================================================================
 See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/llama-cpp-history` to find documentation for previous releases
--- a/docs/compatibility/ml-compatibility/megablocks-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/megablocks-compatibility.rst
@@ -33,19 +33,44 @@ Support overview
  - You can also consult the upstream `Installation guide <https://github.com/databricks/megablocks>`__ 
    for additional context.

-Version support
--------------------------------------------------------------------------------
+.. _megablocks-docker-compat:

-Megablocks is supported on `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`__.
+Compatibility matrix
+================================================================================

-Supported devices
--------------------------------------------------------------------------------
+.. |docker-icon| raw:: html

- **Officially Supported**: AMD Instinct™ MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210
+   <i class="fab fa-docker"></i>

-Supported models and features
--------------------------------------------------------------------------------
+AMD validates and publishes `Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`__
+with ROCm backends on Docker Hub. The following Docker image tag and associated
+inventories represent the latest available Megablocks version from the official Docker Hub. 
+Click |docker-icon| to view the image on Docker Hub.
+
+.. list-table:: 
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Docker image
+      - ROCm
+      - Megablocks
+      - PyTorch
+      - Ubuntu
+      - Python
+      - GPU
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/megablocks/megablocks-0.7.0_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-372ff89b96599019b8f5f9db469c84add2529b713456781fa62eb9a148659ab4"><i class="fab fa-docker fa-lg"></i> rocm/megablocks</a>
+      - `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
+      - `0.7.0 <https://github.com/databricks/megablocks/releases/tag/v0.7.0>`_
+      - `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
+      - 24.04
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+      - MI300X
+
+Supported models and features with ROCm 6.3.0
+================================================================================

 This section summarizes the Megablocks features supported by ROCm.

@@ -77,38 +102,3 @@ It features how to pre-process datasets and how to begin pre-training on AMD GPU
 * Single-GPU pre-training
 * Multi-GPU pre-training

-.. _megablocks-docker-compat:
-
-Docker image compatibility
-================================================================================
-
-.. |docker-icon| raw:: html
-
-   <i class="fab fa-docker"></i>
-
-AMD validates and publishes `Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`__
-with ROCm backends on Docker Hub. The following Docker image tag and associated
-inventories represent the latest available Megablocks version from the official Docker Hub. 
-Click |docker-icon| to view the image on Docker Hub.
-
-.. list-table:: 
-    :header-rows: 1
-    :class: docker-image-compatibility
-
-    * - Docker image
-      - ROCm
-      - Megablocks
-      - PyTorch
-      - Ubuntu
-      - Python
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/megablocks/megablocks-0.7.0_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-372ff89b96599019b8f5f9db469c84add2529b713456781fa62eb9a148659ab4"><i class="fab fa-docker fa-lg"></i> rocm/megablocks</a>
-      - `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
-      - `0.7.0 <https://github.com/databricks/megablocks/releases/tag/v0.7.0>`_
-      - `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
-      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
-
-
--- a/docs/compatibility/ml-compatibility/ray-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/ray-compatibility.rst
@@ -12,8 +12,8 @@ Ray compatibility

 Ray is a unified framework for scaling AI and Python applications from your laptop 
 to a full cluster, without changing your code. Ray consists of `a core distributed 
-runtime  <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`_ and a set of 
-`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`_ for 
+runtime  <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`__ and a set of 
+`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`__ for 
 simplifying machine learning computations.

 Ray is a general-purpose framework that runs many types of workloads efficiently. 
@@ -29,25 +29,57 @@ Support overview
 - To get started and install Ray on ROCm, use the prebuilt :ref:`Docker image <ray-docker-compat>`, 
  which includes ROCm, Ray, and all required dependencies.

-  - The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels 
-    <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__ 
-    corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
-
-  - See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>` 
+  - See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
    for installation and setup instructions.

  - You can also consult the upstream `Installation guide <https://docs.ray.io/en/latest/ray-overview/installation.html>`__ 
    for additional context.

-Version support
--------------------------------------------------------------------------------
+.. _ray-docker-compat:

-Ray is supported on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
+Compatibility matrix
+================================================================================

-Supported devices
--------------------------------------------------------------------------------
+.. |docker-icon| raw:: html

-**Officially Supported**: AMD Instinct™ MI300X, MI210
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
+with ROCm backends on Docker Hub. The following Docker image tags and
+associated inventories represent the latest Ray version from the official Docker Hub.
+Click |docker-icon| to view the image on Docker Hub.
+
+.. list-table::
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Docker image
+      - ROCm
+      - Ray
+      - PyTorch
+      - Ubuntu
+      - Python
+      - GPU
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/ray/ray-2.51.1_rocm7.0.0_ubuntu22.04_py3.12_pytorch2.9.0/images/sha256-a02f6766b4ba406f88fd7e85707ec86c04b569834d869a08043ec9bcbd672168"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
+      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
+      - `2.51.1 <https://github.com/ROCm/ray/tree/release/2.51.1>`__
+      - 2.9.0a0+git1c57644
+      - 22.04
+      - `3.12.12 <https://www.python.org/downloads/release/python-31212/>`__
+      - MI300X
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
+      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
+      - `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`__
+      - 2.6.0+git684f6f2
+      - 24.04
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`__
+      - MI300X, MI210

 Use cases and recommendations
 ================================================================================
@@ -76,36 +108,7 @@ topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accel
 of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
 where you can search for Ray examples and best practices to optimize your workloads on AMD GPUs.

-.. _ray-docker-compat:
-
-Docker image compatibility
-================================================================================
-
-.. |docker-icon| raw:: html
-
-   <i class="fab fa-docker"></i>
-
-AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
-with ROCm backends on Docker Hub. The following Docker image tags and
-associated inventories represent the latest Ray version from the official Docker Hub.
-Click the |docker-icon| icon to view the image on Docker Hub.
-
-.. list-table::
-    :header-rows: 1
-    :class: docker-image-compatibility
-
-    * - Docker image
-      - ROCm
-      - Ray
-      - Pytorch
-      - Ubuntu
-      - Python
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
-      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
-      - `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
-      - 2.6.0+git684f6f2
-      - 24.04
-      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
+Previous versions
+===============================================================================
+See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/ray-history` to find documentation for previous releases
+of the ``ROCm/ray`` Docker image.
--- a/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
@@ -35,19 +35,45 @@ Support overview
  - You can also consult the upstream `Installation guide <https://github.com/NVIDIA/Megatron-LM>`__ 
    for additional context.

-Version support
--------------------------------------------------------------------------------
+.. _megatron-lm-docker-compat:

-Stanford Megatron-LM is supported on `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`__.
+Compatibility matrix
+================================================================================

-Supported devices
--------------------------------------------------------------------------------
+.. |docker-icon| raw:: html

- **Officially Supported**: AMD Instinct™ MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210
+   <i class="fab fa-docker"></i>

-Supported models and features
--------------------------------------------------------------------------------
+AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/stanford-megatron-lm/tags>`_
+with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
+inventories represent the latest Stanford Megatron-LM version from the official Docker Hub.
+Click |docker-icon| to view the image on Docker Hub.
+
+.. list-table:: 
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Docker image
+      - ROCm
+      - Stanford Megatron-LM
+      - PyTorch
+      - Ubuntu
+      - Python
+      - GPU
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i> rocm/stanford-megatron-lm</a>
+
+      - `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
+      - `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
+      - `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
+      - 24.04
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+      - MI300X
+
+Supported models and features with ROCm 6.3.0
+================================================================================

 This section details models & features that are supported by the ROCm version on Stanford Megatron-LM.

@@ -88,41 +114,3 @@ It features how to pre-process datasets and how to begin pre-training on AMD GPU

 * Single-GPU pre-training
 * Multi-GPU pre-training
-
-.. _megatron-lm-docker-compat:
-
-Docker image compatibility
-================================================================================
-
-.. |docker-icon| raw:: html
-
-   <i class="fab fa-docker"></i>
-
-AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/stanford-megatron-lm/tags>`_
-with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
-inventories represent the latest Stanford Megatron-LM version from the official Docker Hub.
-Click |docker-icon| to view the image on Docker Hub.
-
-.. list-table:: 
-    :header-rows: 1
-    :class: docker-image-compatibility
-
-    * - Docker image
-      - ROCm
-      - Stanford Megatron-LM
-      - PyTorch
-      - Ubuntu
-      - Python
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i></a>
-
-      - `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
-      - `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
-      - `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
-      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
-
-      
-
--- a/docs/compatibility/ml-compatibility/verl-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/verl-compatibility.rst
@@ -37,67 +37,9 @@ Support overview
  - You can also consult the upstream `verl documentation <https://verl.readthedocs.io/en/latest/>`__ 
    for additional context.

-Version support
--------------------------------------------------------------------------------
-
-verl is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__ and
-`ROCm 6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`__.
-
-Supported devices
--------------------------------------------------------------------------------
-
-**Officially Supported**: AMD Instinct™ MI300X
-
-.. _verl-recommendations:
-
-Use cases and recommendations
-================================================================================
-
-* The benefits of verl in large-scale reinforcement learning from human feedback 
-  (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD 
-  GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__ 
-  blog. The blog post outlines how the Volcano Engine Reinforcement Learning 
-  (verl) framework integrates with the AMD ROCm platform to optimize training on 
-  AMD Instinct™ GPUs. The guide details the process of building a Docker image, 
-  setting up single-node and multi-node training environments, and highlights 
-  performance benchmarks demonstrating improved throughput and convergence accuracy. 
-  This resource serves as a comprehensive starting point for deploying verl on AMD GPUs, 
-  facilitating efficient RLHF training workflows.
-
-.. _verl-supported_features:
-
-Supported features
-===============================================================================
-
-The following table shows verl on ROCm support for GPU-accelerated modules.
-
-.. list-table::
-    :header-rows: 1
-
-    * - Module
-      - Description
-      - verl version
-      - ROCm version
-    * - ``FSDP``
-      - Training engine
-      - 
-       * 0.6.0
-       * 0.3.0.post0
-      - 
-       * 7.0.0
-       * 6.2.0
-    * - ``vllm``
-      - Inference engine
-      - 
-       * 0.6.0
-       * 0.3.0.post0
-      - 
-       * 7.0.0
-       * 6.2.0
-
 .. _verl-docker-compat:

-Docker image compatibility
+Compatibility matrix
 ================================================================================

 .. |docker-icon| raw:: html
@@ -120,6 +62,7 @@ Click |docker-icon| to view the image on Docker Hub.
     - PyTorch
     - Python
     - vllm
+     - GPU

   * - .. raw:: html

@@ -130,6 +73,7 @@ Click |docker-icon| to view the image on Docker Hub.
     - `2.9.0 <https://github.com/ROCm/pytorch/tree/release/2.9-rocm7.x-gfx115x>`__
     - `3.12.11 <https://www.python.org/downloads/release/python-31211/>`__
     - `0.11.0 <https://github.com/vllm-project/vllm/releases/tag/v0.11.0>`__
+     - MI300X

   * - .. raw:: html

@@ -140,7 +84,33 @@ Click |docker-icon| to view the image on Docker Hub.
     - `2.5.0 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
     - `3.9.19 <https://www.python.org/downloads/release/python-3919/>`__
     - `0.6.3 <https://github.com/vllm-project/vllm/releases/tag/v0.6.3>`__
+     - MI300X

+.. _verl-supported_features:
+
+Supported modules with verl on ROCm
+===============================================================================
+
+The following GPU-accelerated modules are supported with verl on ROCm:
+
+- ``FSDP``: Training engine
+- ``vllm``: Inference engine
+
+.. _verl-recommendations:
+
+Use cases and recommendations
+================================================================================
+
+* The benefits of verl in large-scale reinforcement learning from human feedback 
+  (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD 
+  GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__ 
+  blog. The blog post outlines how the Volcano Engine Reinforcement Learning 
+  (verl) framework integrates with the AMD ROCm platform to optimize training on 
+  AMD Instinct™ GPUs. The guide details the process of building a Docker image, 
+  setting up single-node and multi-node training environments, and highlights 
+  performance benchmarks demonstrating improved throughput and convergence accuracy. 
+  This resource serves as a comprehensive starting point for deploying verl on AMD GPUs, 
+  facilitating efficient RLHF training workflows.

 Previous versions
 ===============================================================================
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -135,24 +135,35 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.7", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.8", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.9", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.10", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.8", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.9", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.10", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]},    
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.7", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.8", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.9", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.10", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.8", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.9", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.10", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]},    

    {"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
@@ -177,8 +188,16 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.1-20250909", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.2-20251006", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.11.1-20251103", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.11", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.13", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
@@ -0,0 +1,316 @@
+dockers:
+  - pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103
+    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506
+    components:
+      ROCm: 7.0.0
+      vLLM: 0.11.1 (0.11.1rc2.dev141+g38f225c2a.rocm700)
+      PyTorch: 2.9.0a0+git1c57644
+      hipBLASLt: 1.0.0
+    dockerfile:
+      commit: 38f225c2abeadc04c2cc398814c2f53ea02c3c72
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 2 70B
+        mad_tag: pyt_vllm_llama-2-70b
+        model_repo: meta-llama/Llama-2-70b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+        precision: float16
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 4096
+          max_model_len: 4096
+      - model: Llama 3.1 8B
+        mad_tag: pyt_vllm_llama-3.1-8b
+        model_repo: meta-llama/Llama-3.1-8B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+        precision: float16
+        config:
+          tp: 1
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+      - model: Llama 3.1 8B FP8
+        mad_tag: pyt_vllm_llama-3.1-8b_fp8
+        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+        precision: float8
+        config:
+          tp: 1
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+      - model: Llama 3.1 405B
+        mad_tag: pyt_vllm_llama-3.1-405b
+        model_repo: meta-llama/Llama-3.1-405B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+        precision: float16
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+      - model: Llama 3.1 405B FP8
+        mad_tag: pyt_vllm_llama-3.1-405b_fp8
+        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+        precision: float8
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+      - model: Llama 3.1 405B MXFP4
+        mad_tag: pyt_vllm_llama-3.1-405b_fp4
+        model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview
+        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview
+        precision: float4
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+      - model: Llama 3.3 70B
+        mad_tag: pyt_vllm_llama-3.3-70b
+        model_repo: meta-llama/Llama-3.3-70B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
+        precision: float16
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+      - model: Llama 3.3 70B FP8
+        mad_tag: pyt_vllm_llama-3.3-70b_fp8
+        model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
+        precision: float8
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+      - model: Llama 3.3 70B MXFP4
+        mad_tag: pyt_vllm_llama-3.3-70b_fp4
+        model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview
+        url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview
+        precision: float4
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+      - model: Llama 4 Scout 17Bx16E
+        mad_tag: pyt_vllm_llama-4-scout-17b-16e
+        model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct
+        url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
+        precision: float16
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 32768
+          max_model_len: 8192
+      - model: Llama 4 Maverick 17Bx128E
+        mad_tag: pyt_vllm_llama-4-maverick-17b-128e
+        model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+        url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
+        precision: float16
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 32768
+          max_model_len: 8192
+      - model: Llama 4 Maverick 17Bx128E FP8
+        mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8
+        model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
+        url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
+        precision: float8
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek R1 0528 FP8
+        mad_tag: pyt_vllm_deepseek-r1
+        model_repo: deepseek-ai/DeepSeek-R1-0528
+        url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
+        precision: float8
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_num_seqs: 1024
+          max_num_batched_tokens: 131072
+          max_model_len: 8192
+  - group: OpenAI GPT OSS
+    tag: gpt-oss
+    models:
+      - model: GPT OSS 20B
+        mad_tag: pyt_vllm_gpt-oss-20b
+        model_repo: openai/gpt-oss-20b
+        url: https://huggingface.co/openai/gpt-oss-20b
+        precision: bfloat16
+        config:
+          tp: 1
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 8192
+          max_model_len: 8192
+      - model: GPT OSS 120B
+        mad_tag: pyt_vllm_gpt-oss-120b
+        model_repo: openai/gpt-oss-120b
+        url: https://huggingface.co/openai/gpt-oss-120b
+        precision: bfloat16
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 8192
+          max_model_len: 8192
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral MoE 8x7B
+        mad_tag: pyt_vllm_mixtral-8x7b
+        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+        precision: float16
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 32768
+          max_model_len: 8192
+      - model: Mixtral MoE 8x7B FP8
+        mad_tag: pyt_vllm_mixtral-8x7b_fp8
+        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        precision: float8
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_num_batched_tokens: 32768
+          max_model_len: 8192
+      - model: Mixtral MoE 8x22B
+        mad_tag: pyt_vllm_mixtral-8x22b
+        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+        precision: float16
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 65536
+          max_model_len: 8192
+      - model: Mixtral MoE 8x22B FP8
+        mad_tag: pyt_vllm_mixtral-8x22b_fp8
+        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        precision: float8
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_num_batched_tokens: 65536
+          max_model_len: 8192
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen3 8B
+        mad_tag: pyt_vllm_qwen3-8b
+        model_repo: Qwen/Qwen3-8B
+        url: https://huggingface.co/Qwen/Qwen3-8B
+        precision: float16
+        config:
+          tp: 1
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 40960
+          max_model_len: 8192
+      - model: Qwen3 32B
+        mad_tag: pyt_vllm_qwen3-32b
+        model_repo: Qwen/Qwen3-32B
+        url: https://huggingface.co/Qwen/Qwen3-32B
+        precision: float16
+        config:
+          tp: 1
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 40960
+          max_model_len: 8192
+      - model: Qwen3 30B A3B
+        mad_tag: pyt_vllm_qwen3-30b-a3b
+        model_repo: Qwen/Qwen3-30B-A3B
+        url: https://huggingface.co/Qwen/Qwen3-30B-A3B
+        precision: float16
+        config:
+          tp: 1
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 40960
+          max_model_len: 8192
+      - model: Qwen3 30B A3B FP8
+        mad_tag: pyt_vllm_qwen3-30b-a3b_fp8
+        model_repo: Qwen/Qwen3-30B-A3B-FP8
+        url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8
+        precision: float8
+        config:
+          tp: 1
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_num_batched_tokens: 40960
+          max_model_len: 8192
+      - model: Qwen3 235B A22B
+        mad_tag: pyt_vllm_qwen3-235b-a22b
+        model_repo: Qwen/Qwen3-235B-A22B
+        url: https://huggingface.co/Qwen/Qwen3-235B-A22B
+        precision: float16
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 40960
+          max_model_len: 8192
+      - model: Qwen3 235B A22B FP8
+        mad_tag: pyt_vllm_qwen3-235b-a22b_fp8
+        model_repo: Qwen/Qwen3-235B-A22B-FP8
+        url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8
+        precision: float8
+        config:
+          tp: 8
+          dtype: auto
+          kv_cache_dtype: fp8
+          max_num_batched_tokens: 40960
+          max_model_len: 8192
+  - group: Microsoft Phi
+    tag: phi
+    models:
+      - model: Phi-4
+        mad_tag: pyt_vllm_phi-4
+        model_repo: microsoft/phi-4
+        url: https://huggingface.co/microsoft/phi-4
+        precision: float16
+        config:
+          tp: 1
+          dtype: auto
+          kv_cache_dtype: auto
+          max_num_batched_tokens: 16384
+          max_model_len: 8192
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
@@ -1,7 +1,7 @@
 xdit_diffusion_inference:
  docker:
    pull_tag: rocm/pytorch-xdit:v25.10
-    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
+    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.10/images/sha256-d79715ff18a9470e3f907cec8a9654d6b783c63370b091446acffc0de4d7070e
    ROCm: 7.9.0
    components:
      TheRock: 7afbe45
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
@@ -0,0 +1,109 @@
+xdit_diffusion_inference:
+  docker:
+    - version: v25-11
+      pull_tag: rocm/pytorch-xdit:v25.11
+      docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.11/images/sha256-c9fa659439bb024f854b4d5eea598347251b02c341c55f66c98110832bde4216
+      ROCm: 7.10.0
+      supported_models:
+        - group: Hunyuan Video
+          models:
+            - Hunyuan Video
+        - group: Wan-AI
+          models:
+            - Wan2.1
+            - Wan2.2
+        - group: FLUX
+          models:
+            - FLUX.1
+      whats_new:
+        - "Minor bug fixes and clarifications to READMEs."
+        - "Bumps TheRock, AITER, Diffusers, xDiT versions."
+        - "Changes Aiter rounding mode for faster gfx942 FWD Attention."
+      components:
+        TheRock: 3e3f834
+        rccl: d23d18f
+        composable_kernel: '2570462'
+        rocm-libraries: 0588f07
+        rocm-systems: 473025a
+        torch: 73adac
+        torchvision: f5c6c2e
+        triton: 7416ffc
+        accelerate: 34c1779
+        aiter: de14bec
+        diffusers: '40528e9'
+        xfuser: 83978b5
+        yunchang: 2c9b712
+
+    - version: v25-10
+      pull_tag: rocm/pytorch-xdit:v25.10
+      docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.10/images/sha256-d79715ff18a9470e3f907cec8a9654d6b783c63370b091446acffc0de4d7070e
+      ROCm: 7.9.0
+      supported_models:
+        - group: Hunyuan Video
+          models:
+            - Hunyuan Video
+        - group: Wan-AI
+          models:
+            - Wan2.1
+            - Wan2.2
+        - group: FLUX
+          models:
+            - FLUX.1
+      whats_new:
+        - "First official xDiT Docker Release for Diffusion Inference."
+        - "Supports gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X)."
+        - "Supports Wan 2.1, Wan 2.2, HunyuanVideo and Flux workloads."
+      components:
+        TheRock: 7afbe45
+        rccl: 9b04b2a
+        composable_kernel: b7a806f
+        rocm-libraries: f104555
+        rocm-systems: 25922d0
+        torch: 2.10.0a0+gite9c9017
+        torchvision: 0.22.0a0+966da7e
+        triton: 3.5.0+git52e49c12
+        accelerate: 1.11.0.dev0
+        aiter: 0.1.5.post4.dev20+ga25e55e79
+        diffusers: 0.36.0.dev0
+        xfuser: 0.4.4
+        yunchang: 0.6.3.post1
+
+  model_groups:
+    - group: Hunyuan Video
+      tag: hunyuan
+      models:
+        - model: Hunyuan Video
+          page_tag: hunyuan_tag
+          model_name: hunyuanvideo
+          model_repo: tencent/HunyuanVideo
+          revision: refs/pr/18
+          url: https://huggingface.co/tencent/HunyuanVideo
+          github: https://github.com/Tencent-Hunyuan/HunyuanVideo
+          mad_tag: pyt_xdit_hunyuanvideo
+    - group: Wan-AI
+      tag: wan
+      models:
+        - model: Wan2.1
+          page_tag: wan_21_tag
+          model_name: wan2_1-i2v-14b-720p
+          model_repo: Wan-AI/Wan2.1-I2V-14B-720P
+          url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
+          github: https://github.com/Wan-Video/Wan2.1
+          mad_tag: pyt_xdit_wan_2_1
+        - model: Wan2.2
+          page_tag: wan_22_tag
+          model_name: wan2_2-i2v-a14b
+          model_repo: Wan-AI/Wan2.2-I2V-A14B
+          url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
+          github: https://github.com/Wan-Video/Wan2.2
+          mad_tag: pyt_xdit_wan_2_2
+    - group: FLUX
+      tag: flux
+      models:
+        - model: FLUX.1
+          page_tag: flux_1_tag
+          model_name: FLUX.1-dev
+          model_repo: black-forest-labs/FLUX.1-dev
+          url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+          github: https://github.com/black-forest-labs/flux
+          mad_tag: pyt_xdit_flux
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
@@ -0,0 +1,91 @@
+docker:
+  pull_tag: rocm/pytorch-xdit:v25.12
+  docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.12/images/sha256-e06895132316bf3c393366b70a91eaab6755902dad0100e6e2b38310547d9256
+  ROCm: 7.10.0
+  whats_new:
+      - "Adds T2V and TI2V support for Wan models."
+      - "Adds support for SD-3.5 T2I model."
+  components:
+    TheRock: 
+      version: 3e3f834
+      url: https://github.com/ROCm/TheRock
+    rccl:
+      version: d23d18f
+      url: https://github.com/ROCm/rccl
+    composable_kernel:
+      version: '2570462'
+      url: https://github.com/ROCm/composable_kernel
+    rocm-libraries:
+      version: 0588f07
+      url: https://github.com/ROCm/rocm-libraries
+    rocm-systems:
+      version: 473025a
+      url: https://github.com/ROCm/rocm-systems
+    torch:
+      version: 73adac
+      url: https://github.com/pytorch/pytorch
+    torchvision:
+      version: f5c6c2e
+      url: https://github.com/pytorch/vision
+    triton:
+      version: 7416ffc
+      url: https://github.com/triton-lang/triton
+    accelerate:
+      version: 34c1779
+      url: https://github.com/huggingface/accelerate
+    aiter:
+      version: de14bec
+      url: https://github.com/ROCm/aiter
+    diffusers:
+      version: '40528e9'
+      url: https://github.com/huggingface/diffusers
+    xfuser:
+      version: ccba9d5
+      url: https://github.com/xdit-project/xDiT
+    yunchang:
+      version: 2c9b712
+      url: https://github.com/feifeibear/long-context-attention
+  supported_models:
+    - group: Hunyuan Video
+      js_tag: hunyuan
+      models:
+        - model: Hunyuan Video
+          model_repo: tencent/HunyuanVideo
+          revision: refs/pr/18
+          url: https://huggingface.co/tencent/HunyuanVideo
+          github: https://github.com/Tencent-Hunyuan/HunyuanVideo
+          mad_tag: pyt_xdit_hunyuanvideo
+          js_tag: hunyuan_tag
+    - group: Wan-AI
+      js_tag: wan
+      models:
+        - model: Wan2.1
+          model_repo: Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
+          url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
+          github: https://github.com/Wan-Video/Wan2.1
+          mad_tag: pyt_xdit_wan_2_1
+          js_tag: wan_21_tag
+        - model: Wan2.2
+          model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers
+          url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers
+          github: https://github.com/Wan-Video/Wan2.2
+          mad_tag: pyt_xdit_wan_2_2
+          js_tag: wan_22_tag
+    - group: FLUX
+      js_tag: flux
+      models:
+        - model: FLUX.1
+          model_repo: black-forest-labs/FLUX.1-dev
+          url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+          github: https://github.com/black-forest-labs/flux
+          mad_tag: pyt_xdit_flux
+          js_tag: flux_1_tag
+    - group: Stable Diffusion
+      js_tag: stablediffusion
+      models:
+        - model: stable-diffusion-3.5-large
+          model_repo: stabilityai/stable-diffusion-3.5-large
+          url: https://huggingface.co/stabilityai/stable-diffusion-3.5-large
+          github: https://github.com/Stability-AI/sd3.5
+          mad_tag: pyt_xdit_sd_3_5
+          js_tag: stable_diffusion_3_5_large_tag
--- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
@@ -1,13 +1,13 @@
 dockers:
-  - pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103
-    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506
+  - pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.2_20251210
+    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.2_20251210/images/sha256-e7f02dd2ce3824959658bc0391296f6158638e3ebce164f6c019c4eca8150ec7
    components:
      ROCm: 7.0.0
-      vLLM: 0.11.1 (0.11.1rc2.dev141+g38f225c2a.rocm700)
+      vLLM: 0.11.2 (0.11.2.dev673+g839868462.rocm700)
      PyTorch: 2.9.0a0+git1c57644
      hipBLASLt: 1.0.0
    dockerfile:
-      commit: 38f225c2abeadc04c2cc398814c2f53ea02c3c72
+      commit: 8398684622109c806a35d660647060b0b9910663
 model_groups:
  - group: Meta Llama
    tag: llama
--- a/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
@@ -1,109 +1,105 @@
-xdit_diffusion_inference:
-  docker:
-    - version: v25-11
-      pull_tag: rocm/pytorch-xdit:v25.11
-      docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
-      ROCm: 7.10.0
-      supported_models:
-        - group: Hunyuan Video
-          models:
-            - Hunyuan Video
-        - group: Wan-AI
-          models:
-            - Wan2.1
-            - Wan2.2
-        - group: FLUX
-          models:
-            - FLUX.1
-      whats_new:
-        - "Minor bug fixes and clarifications to READMEs."
-        - "Bumps TheRock, AITER, Diffusers, xDiT versions."
-        - "Changes Aiter rounding mode for faster gfx942 FWD Attention."
-      components:
-        TheRock: 3e3f834
-        rccl: d23d18f
-        composable_kernel: 2570462
-        rocm-libraries: 0588f07
-        rocm-systems: 473025a
-        torch: 73adac
-        torchvision: f5c6c2e
-        triton: 7416ffc
-        accelerate: 34c1779
-        aiter: de14bec
-        diffusers: 40528e9
-        xfuser: 83978b5
-        yunchang: 2c9b712
-
-    - version: v25-10
-      pull_tag: rocm/pytorch-xdit:v25.10
-      docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
-      ROCm: 7.9.0
-      supported_models:
-        - group: Hunyuan Video
-          models:
-            - Hunyuan Video
-        - group: Wan-AI
-          models:
-            - Wan2.1
-            - Wan2.2
-        - group: FLUX
-          models:
-            - FLUX.1
-      whats_new:
-        - "First official xDiT Docker Release for Diffusion Inference."
-        - "Supports gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X)."
-        - "Support Wan 2.1, Wan 2.2, HunyuanVideo and Flux workloads."
-      components:
-        TheRock: 7afbe45
-        rccl: 9b04b2a
-        composable_kernel: b7a806f
-        rocm-libraries: f104555
-        rocm-systems: 25922d0
-        torch: 2.10.0a0+gite9c9017
-        torchvision: 0.22.0a0+966da7e
-        triton: 3.5.0+git52e49c12
-        accelerate: 1.11.0.dev0
-        aiter: 0.1.5.post4.dev20+ga25e55e79
-        diffusers: 0.36.0.dev0
-        xfuser: 0.4.4
-        yunchang: 0.6.3.post1
-
-  model_groups:
+docker:
+  pull_tag: rocm/pytorch-xdit:v25.13
+  docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef
+  ROCm: 7.11.0
+  whats_new:
+    - "Flux.1 Kontext support"
+    - "Flux.2 Dev support"
+    - "Flux FP8 GEMM support"
+    - "Hybrid FP8 attention support for Wan models"
+  components:
+    TheRock: 
+      version: 1728a81
+      url: https://github.com/ROCm/TheRock
+    rccl:
+      version: d23d18f
+      url: https://github.com/ROCm/rccl
+    composable_kernel:
+      version: ab0101c
+      url: https://github.com/ROCm/composable_kernel
+    rocm-libraries:
+      version: a2f7c35
+      url: https://github.com/ROCm/rocm-libraries
+    rocm-systems:
+      version: 659737c
+      url: https://github.com/ROCm/rocm-systems
+    torch:
+      version: 91be249
+      url: https://github.com/ROCm/pytorch
+    torchvision:
+      version: b919bd0
+      url: https://github.com/pytorch/vision
+    triton:
+      version: a272dfa
+      url: https://github.com/ROCm/triton
+    accelerate:
+      version: b521400f
+      url: https://github.com/huggingface/accelerate
+    aiter:
+      version: de14bec0
+      url: https://github.com/ROCm/aiter
+    diffusers:
+      version: a1f36ee3e
+      url: https://github.com/huggingface/diffusers
+    xfuser:
+      version: adf2681
+      url: https://github.com/xdit-project/xDiT
+    yunchang:
+      version: 2c9b712
+      url: https://github.com/feifeibear/long-context-attention
+  supported_models:
    - group: Hunyuan Video
-      tag: hunyuan
+      js_tag: hunyuan
      models:
        - model: Hunyuan Video
-          page_tag: hunyuan_tag
-          model_name: hunyuanvideo
          model_repo: tencent/HunyuanVideo
          revision: refs/pr/18
          url: https://huggingface.co/tencent/HunyuanVideo
          github: https://github.com/Tencent-Hunyuan/HunyuanVideo
          mad_tag: pyt_xdit_hunyuanvideo
+          js_tag: hunyuan_tag
    - group: Wan-AI
-      tag: wan
+      js_tag: wan
      models:
        - model: Wan2.1
-          page_tag: wan_21_tag
-          model_name: wan2_1-i2v-14b-720p
-          model_repo: Wan-AI/Wan2.1-I2V-14B-720P
-          url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
+          model_repo: Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
+          url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
          github: https://github.com/Wan-Video/Wan2.1
          mad_tag: pyt_xdit_wan_2_1
+          js_tag: wan_21_tag
        - model: Wan2.2
-          page_tag: wan_22_tag
-          model_name: wan2_2-i2v-a14b
-          model_repo: Wan-AI/Wan2.2-I2V-A14B
-          url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
+          model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers
+          url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers
          github: https://github.com/Wan-Video/Wan2.2
          mad_tag: pyt_xdit_wan_2_2
+          js_tag: wan_22_tag
    - group: FLUX
-      tag: flux
+      js_tag: flux
      models:
        - model: FLUX.1
-          page_tag: flux_1_tag
-          model_name: FLUX.1-dev
          model_repo: black-forest-labs/FLUX.1-dev
          url: https://huggingface.co/black-forest-labs/FLUX.1-dev
          github: https://github.com/black-forest-labs/flux
          mad_tag: pyt_xdit_flux
+          js_tag: flux_1_tag
+        - model: FLUX.1 Kontext
+          model_repo: black-forest-labs/FLUX.1-Kontext-dev
+          url: https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev
+          github: https://github.com/black-forest-labs/flux
+          mad_tag: pyt_xdit_flux_kontext
+          js_tag: flux_1_kontext_tag
+        - model: FLUX.2
+          model_repo: black-forest-labs/FLUX.2-dev
+          url: https://huggingface.co/black-forest-labs/FLUX.2-dev
+          github: https://github.com/black-forest-labs/flux2
+          mad_tag: pyt_xdit_flux_2
+          js_tag: flux_2_tag
+    - group: Stable Diffusion
+      js_tag: stablediffusion
+      models:
+        - model: stable-diffusion-3.5-large
+          model_repo: stabilityai/stable-diffusion-3.5-large
+          url: https://huggingface.co/stabilityai/stable-diffusion-3.5-large
+          github: https://github.com/Stability-AI/sd3.5
+          mad_tag: pyt_xdit_sd_3_5
+          js_tag: stable_diffusion_3_5_large_tag
--- a/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
@@ -1,12 +1,12 @@
 dockers:
-  - pull_tag: rocm/jax-training:maxtext-v25.9
-    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
+  - pull_tag: rocm/jax-training:maxtext-v25.11
+    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.11/images/sha256-18e4d8f0b8ce7a7422c58046940dd5f32249960449fca09a562b65fb8eb1562a
    components:
-      ROCm: 7.0.0
-      JAX: 0.6.2
-      Python: 3.10.18
-      Transformer Engine: 2.2.0.dev0+c91bac54
-      hipBLASLt: 1.x.x
+      ROCm: 7.1.0
+      JAX: 0.7.1
+      Python: 3.12
+      Transformer Engine: 2.4.0.dev0+281042de
+      hipBLASLt: 1.2.x
 model_groups:
  - group: Meta Llama
    tag: llama
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.9-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.9-benchmark-models.yaml
@@ -0,0 +1,64 @@
+dockers:
+  - pull_tag: rocm/jax-training:maxtext-v25.9.1
+    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.9.1/images/sha256-60946cfbd470f6ee361fc9da740233a4fb2e892727f01719145b1f7627a1cff6
+    components:
+      ROCm: 7.0.0
+      JAX: 0.6.2
+      Python: 3.10.18
+      Transformer Engine: 2.2.0.dev0+c91bac54
+      hipBLASLt: 1.x.x
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 2 7B
+        mad_tag: jax_maxtext_train_llama-2-7b
+        model_repo: Llama-2-7B
+        precision: bf16
+        multinode_training_script: llama2_7b_multinode.sh
+        doc_options: ["single-node", "multi-node"]
+      - model: Llama 2 70B
+        mad_tag: jax_maxtext_train_llama-2-70b
+        model_repo: Llama-2-70B
+        precision: bf16
+        multinode_training_script: llama2_70b_multinode.sh
+        doc_options: ["single-node", "multi-node"]
+      - model: Llama 3 8B (multi-node)
+        mad_tag: jax_maxtext_train_llama-3-8b
+        multinode_training_script: llama3_8b_multinode.sh
+        doc_options: ["multi-node"]
+      - model: Llama 3 70B (multi-node)
+        mad_tag: jax_maxtext_train_llama-3-70b
+        multinode_training_script: llama3_70b_multinode.sh
+        doc_options: ["multi-node"]
+      - model: Llama 3.1 8B
+        mad_tag: jax_maxtext_train_llama-3.1-8b
+        model_repo: Llama-3.1-8B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3.1 70B
+        mad_tag: jax_maxtext_train_llama-3.1-70b
+        model_repo: Llama-3.1-70B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3.3 70B
+        mad_tag: jax_maxtext_train_llama-3.3-70b
+        model_repo: Llama-3.3-70B
+        precision: bf16
+        doc_options: ["single-node"]
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V2-Lite (16B)
+        mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
+        model_repo: DeepSeek-V2-lite
+        precision: bf16
+        doc_options: ["single-node"]
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: jax_maxtext_train_mixtral-8x7b
+        model_repo: Mixtral-8x7B
+        precision: bf16
+        doc_options: ["single-node"]
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.10-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.10-benchmark-models.yaml
@@ -0,0 +1,49 @@
+docker:
+  pull_tag: rocm/primus:v25.10
+  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
+  components:
+    ROCm: 7.1.0
+    Primus: 0.3.0
+    Primus Turbo: 0.1.1
+    PyTorch: 2.10.0.dev20251112+rocm7.1
+    Python: "3.10"
+    Transformer Engine: 2.4.0.dev0+32e2d1d4
+    Flash Attention: 2.8.3
+    hipBLASLt: 1.2.0-09ab7153e2
+    Triton: 3.4.0
+    RCCL: 2.27.7
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
+      - model: Llama 3.1 8B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
+      - model: Llama 3.1 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
+      - model: Llama 2 7B
+        mad_tag: pyt_megatron_lm_train_llama-2-7b
+      - model: Llama 2 70B
+        mad_tag: pyt_megatron_lm_train_llama-2-70b
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
+      - model: DeepSeek-V2-Lite
+        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
+      - model: Qwen 2.5 72B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.10-benchmark-models.yaml
@@ -0,0 +1,58 @@
+docker:
+  pull_tag: rocm/primus:v25.10
+  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
+  components:
+    ROCm: 7.1.0
+    PyTorch: 2.10.0.dev20251112+rocm7.1
+    Python: "3.10"
+    Transformer Engine: 2.4.0.dev0+32e2d1d4
+    Flash Attention: 2.8.3
+    hipBLASLt: 1.2.0-09ab7153e2
+    Triton: 3.4.0
+    RCCL: 2.27.7
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
+        config_name: llama3.3_70B-pretrain.yaml
+      - model: Llama 3.1 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
+        config_name: llama3.1_70B-pretrain.yaml
+      - model: Llama 3.1 8B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
+        config_name: llama3.1_8B-pretrain.yaml
+      - model: Llama 2 7B
+        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
+        config_name: llama2_7B-pretrain.yaml
+      - model: Llama 2 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
+        config_name: llama2_70B-pretrain.yaml
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
+        config_name: deepseek_v3-pretrain.yaml
+      - model: DeepSeek-V2-Lite
+        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
+        config_name: deepseek_v2_lite-pretrain.yaml
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
+        config_name: mixtral_8x7B_v0.1-pretrain.yaml
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
+        config_name: mixtral_8x22B_v0.1-pretrain.yaml
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
+        config_name: primus_qwen2.5_7B-pretrain.yaml
+      - model: Qwen 2.5 72B
+        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
+        config_name: qwen2.5_72B-pretrain.yaml
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.10-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.10-benchmark-models.yaml
@@ -0,0 +1,32 @@
+docker:
+  pull_tag: rocm/primus:v25.10
+  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
+  components:
+    ROCm: 7.1.0
+    PyTorch: 2.10.0.dev20251112+rocm7.1
+    Python: "3.10"
+    Transformer Engine: 2.4.0.dev0+32e2d1d4
+    Flash Attention: 2.8.3
+    hipBLASLt: 1.2.0-09ab7153e2
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.1 8B
+        mad_tag: primus_pyt_train_llama-3.1-8b
+        model_repo: Llama-3.1-8B
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B
+        precision: BF16
+      - model: Llama 3.1 70B
+        mad_tag: primus_pyt_train_llama-3.1-70b
+        model_repo: Llama-3.1-70B
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B
+        precision: BF16
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek V2 16B
+        mad_tag: primus_pyt_train_deepseek-v2
+        model_repo: DeepSeek-V2
+        url: https://huggingface.co/deepseek-ai/DeepSeek-V2
+        precision: BF16
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.10-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.10-benchmark-models.yaml
@@ -0,0 +1,197 @@
+docker:
+  pull_tag: rocm/primus:v25.10
+  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
+  components:
+    ROCm: 7.1.0
+    Primus: 0.3.0
+    Primus Turbo: 0.1.1
+    PyTorch: 2.10.0.dev20251112+rocm7.1
+    Python: "3.10"
+    Transformer Engine: 2.4.0.dev0+32e2d1d4
+    Flash Attention: 2.8.3
+    hipBLASLt: 1.2.0-09ab7153e2
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+    - model: Llama 4 Scout 17B-16E
+      mad_tag: pyt_train_llama-4-scout-17b-16e
+      model_repo: Llama-4-17B_16E
+      url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.3 70B
+      mad_tag: pyt_train_llama-3.3-70b
+      model_repo: Llama-3.3-70B
+      url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 3.2 1B
+      mad_tag: pyt_train_llama-3.2-1b
+      model_repo: Llama-3.2-1B
+      url: https://huggingface.co/meta-llama/Llama-3.2-1B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 3B
+      mad_tag: pyt_train_llama-3.2-3b
+      model_repo: Llama-3.2-3B
+      url: https://huggingface.co/meta-llama/Llama-3.2-3B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 Vision 11B
+      mad_tag: pyt_train_llama-3.2-vision-11b
+      model_repo: Llama-3.2-Vision-11B
+      url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.2 Vision 90B
+      mad_tag: pyt_train_llama-3.2-vision-90b
+      model_repo: Llama-3.2-Vision-90B
+      url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+      precision: BF16
+      training_modes: [pretrain, finetune_fw, finetune_lora]
+    - model: Llama 3.1 405B
+      mad_tag: pyt_train_llama-3.1-405b
+      model_repo: Llama-3.1-405B
+      url: https://huggingface.co/meta-llama/Llama-3.1-405B
+      precision: BF16
+      training_modes: [finetune_qlora]
+    - model: Llama 3 8B
+      mad_tag: pyt_train_llama-3-8b
+      model_repo: Llama-3-8B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3 70B
+      mad_tag: pyt_train_llama-3-70b
+      model_repo: Llama-3-70B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 7B
+      mad_tag: pyt_train_llama-2-7b
+      model_repo: Llama-2-7B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 2 13B
+      mad_tag: pyt_train_llama-2-13b
+      model_repo: Llama-2-13B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 70B
+      mad_tag: pyt_train_llama-2-70b
+      model_repo: Llama-2-70B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_lora, finetune_qlora]
+  - group: OpenAI
+    tag: openai
+    models:
+    - model: GPT OSS 20B
+      mad_tag: pyt_train_gpt_oss_20b
+      model_repo: GPT-OSS-20B
+      url: https://huggingface.co/openai/gpt-oss-20b
+      precision: BF16
+      training_modes: [HF_finetune_lora]
+    - model: GPT OSS 120B
+      mad_tag: pyt_train_gpt_oss_120b
+      model_repo: GPT-OSS-120B
+      url: https://huggingface.co/openai/gpt-oss-120b
+      precision: BF16
+      training_modes: [HF_finetune_lora]
+  - group: DeepSeek
+    tag: deepseek
+    models:
+    - model: DeepSeek V2 16B
+      mad_tag: primus_pyt_train_deepseek-v2
+      model_repo: DeepSeek-V2
+      url: https://huggingface.co/deepseek-ai/DeepSeek-V2
+      precision: BF16
+      training_modes: [pretrain]
+  - group: Qwen
+    tag: qwen
+    models:
+    - model: Qwen 3 8B
+      mad_tag: pyt_train_qwen3-8b
+      model_repo: Qwen3-8B
+      url: https://huggingface.co/Qwen/Qwen3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Qwen 3 32B
+      mad_tag: pyt_train_qwen3-32b
+      model_repo: Qwen3-32
+      url: https://huggingface.co/Qwen/Qwen3-32B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2.5 32B
+      mad_tag: pyt_train_qwen2.5-32b
+      model_repo: Qwen2.5-32B
+      url: https://huggingface.co/Qwen/Qwen2.5-32B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2.5 72B
+      mad_tag: pyt_train_qwen2.5-72b
+      model_repo: Qwen2.5-72B
+      url: https://huggingface.co/Qwen/Qwen2.5-72B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2 1.5B
+      mad_tag: pyt_train_qwen2-1.5b
+      model_repo: Qwen2-1.5B
+      url: https://huggingface.co/Qwen/Qwen2-1.5B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Qwen 2 7B
+      mad_tag: pyt_train_qwen2-7b
+      model_repo: Qwen2-7B
+      url: https://huggingface.co/Qwen/Qwen2-7B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+  - group: Stable Diffusion
+    tag: sd
+    models:
+    - model: Stable Diffusion XL
+      mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
+      model_repo: SDXL
+      url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
+      precision: BF16
+      training_modes: [posttrain]
+  - group: Flux
+    tag: flux
+    models:
+    - model: FLUX.1-dev
+      mad_tag: pyt_train_flux
+      model_repo: Flux
+      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+      precision: BF16
+      training_modes: [posttrain]
+  - group: NCF
+    tag: ncf
+    models:
+    - model: NCF
+      mad_tag: pyt_ncf_training
+      model_repo:
+      url: https://github.com/ROCm/FluxBenchmark
+      precision: FP32
+  - group: DLRM
+    tag: dlrm
+    models:
+    - model: DLRM v2
+      mad_tag: pyt_train_dlrm
+      model_repo: DLRM
+      url: https://github.com/AMD-AGI/DLRMBenchmark
+      training_modes: [pretrain]
--- a/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
@@ -1,5 +1,5 @@
 docker:
-  pull_tag: rocm/primus:v25.10
+  pull_tag: rocm/primus:v25.11
  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
  components:
    ROCm: 7.1.0
--- a/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
@@ -1,5 +1,5 @@
 docker:
-  pull_tag: rocm/primus:v25.10
+  pull_tag: rocm/primus:v25.11
  docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
  components:
    ROCm: 7.1.0
@@ -25,8 +25,8 @@ model_groups:
  - group: DeepSeek
    tag: deepseek
    models:
-      - model: DeepSeek V2 16B
-        mad_tag: primus_pyt_train_deepseek-v2
-        model_repo: DeepSeek-V2
-        url: https://huggingface.co/deepseek-ai/DeepSeek-V2
+      - model: DeepSeek V3 16B
+        mad_tag: primus_pyt_train_deepseek-v3-16b
+        model_repo: DeepSeek-V3
+        url: https://huggingface.co/deepseek-ai/DeepSeek-V3
        precision: BF16
--- a/docs/how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries.rst
@@ -24,94 +24,102 @@ performance.
   :alt: Attention module of a large language module utilizing tiling
   :align: center

+Installation prerequisites
+----------------------------
+
+Before installing Flash Attention 2, ensure the following are available:
+
+*  ROCm-enabled PyTorch
+*  Triton
+
+These can be installed by following the official
+`PyTorch installation guide <https://pytorch.org/get-started/locally/>`_. Alternatively, for a simpler setup, you can use a preconfigured
+:ref:`ROCm PyTorch Docker image <using-docker-with-pytorch-pre-installed>`, which already includes the required libraries.
+
 Installing Flash Attention 2 
 ----------------------------

-ROCm provides two different implementations of Flash Attention 2 modules. They can be deployed interchangeably:
+`Flash Attention <https://github.com/Dao-AILab/flash-attention>`_ supports two backend implementations on AMD GPUs.

-*  ROCm `Composable Kernel <https://github.com/ROCm/composable_kernel/tree/develop/example/01_gemm>`_
-   (CK) Flash Attention 2
+*  `Composable Kernel (CK) <https://github.com/ROCm/composable_kernel>`__ - the default backend
+*  `OpenAI Triton <https://github.com/triton-lang/triton>`__ - an alternative backend

-*  `OpenAI Triton <https://triton-lang.org/main/index.html>`_ Flash Attention 2
+You can switch between these backends using the environment variable ``FLASH_ATTENTION_TRITON_AMD_ENABLE``:

-.. tab-set::
+``FLASH_ATTENTION_TRITON_AMD_ENABLE="FALSE"``
+→ Use Composable Kernel (CK) backend (Flash Attention 2)

-   .. tab-item:: CK Flash Attention 2
+``FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"``
+→ Use OpenAI Triton backend (Flash Attention 2)

-      To install CK Flash Attention 2, use the following commands.
+To install Flash Attention 2, use the following commands:

-      .. code-block:: shell
+.. code-block:: shell

-         # Install from source
-         git clone https://github.com/ROCm/flash-attention.git
-         cd flash-attention/
-         GPU_ARCHS=gfx942 python setup.py install #MI300 Series
+   git clone https://github.com/Dao-AILab/flash-attention.git
+   cd flash-attention/
+   pip install ninja

-      Hugging Face Transformers can easily deploy the CK Flash Attention 2 module by passing an argument
-      ``attn_implementation="flash_attention_2"`` in the ``from_pretrained`` class.
+   # To install the CK backend flash attention
+   python setup.py install 

-      .. code-block:: python
+   # To install the Triton backend flash attention
+   FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" python setup.py install 

-         import torch
-         from transformers import AutoModelForCausalLM, AutoTokenizer
-         device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-         model_name = "NousResearch/Meta-Llama-3-8B"
+   # To install both CK and Triton backend flash attention
+   FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE FLASH_ATTENTION_SKIP_CK_BUILD=FALSE python setup.py install

-         tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype=torch.float16, use_fast=False)
-         inputs = tokenizer('Today is', return_tensors='pt').to(device)
+For detailed installation instructions, see `Flash Attention <https://github.com/Dao-AILab/flash-attention>`_.

-         model_eager = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, attn_implementation="eager").cuda(device)
-         model_ckFAv2 = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, attn_implementation="flash_attention_2").cuda(device)
+Benchmarking Flash Attention 2 
+------------------------------

-         print("eager GQA: ", tokenizer.decode(model_eager.generate(**inputs, max_new_tokens=10)[0], skip_special_tokens=True))
-         print("ckFAv2 GQA: ", tokenizer.decode(model_ckFAv2.generate(**inputs, max_new_tokens=10)[0], skip_special_tokens=True))
+Benchmark scripts to evaluate the performance of Flash Attention 2 are stored in the ``flash-attention/benchmarks/`` directory.

-         #  eager GQA:  Today is the day of the Lord, and we are the
-         # ckFAv2 GQA: Today is the day of the Lord, and we are the
+To benchmark the CK backend:

-   .. tab-item:: Triton Flash Attention 2
+.. code-block:: shell

-      The Triton Flash Attention 2 module is implemented in Python and uses OpenAI’s JIT compiler. This module has been
-      upstreamed into the vLLM serving toolkit, discussed in :doc:'llm-inference-frameworks'. 
+   cd flash-attention/benchmarks
+   pip install transformers einops ninja

-      1. To install Triton Flash Attention 2 and run the benchmark, use the following commands.
+   python3 benchmark_flash_attention.py 

-         .. code-block:: shell
+To benchmark the Triton backend:

-            # Install from the source
-            pip uninstall pytorch-triton-rocm triton -y
-            git clone https://github.com/ROCm/triton.git 
-            cd triton/python
-            GPU_ARCHS=gfx942 python setup.py install #MI300 series
-            pip install matplotlib pandas
+.. code-block:: shell

-      2. To test, run the Triton Flash Attention 2 performance benchmark.
+   FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" python3 benchmark_flash_attention.py

-         .. code-block:: shell
-         
-            # Test the triton FA v2 kernel
-            python https://github.com/ROCm/triton/blob/triton-mlir/python/perf-kernels/flash-attention.py
-            # Results (Okay to release TFLOPS number ???)
-            fused-attention-fwd-d128:
-                BATCH    HQ    HK  N_CTX_Q  N_CTX_K      TFLOPS
-            0    16.0  16.0  16.0   1024.0   1024.0  287.528411
-            1     8.0  16.0  16.0   2048.0   2048.0  287.490806
-            2     4.0  16.0  16.0   4096.0   4096.0  345.966031
-            3     2.0  16.0  16.0   8192.0   8192.0  361.369510
-            4     1.0  16.0  16.0  16384.0  16384.0  356.873720
-            5     2.0  48.0  48.0   1024.0   1024.0  216.916235
-            6     2.0  48.0  48.0   2048.0   1024.0  271.027578
-            7     2.0  48.0  48.0   4096.0   8192.0  337.367372
-            8     2.0  48.0  48.0   8192.0   4096.0  363.481649
-            9     2.0  48.0  48.0  16384.0   8192.0  375.013622
-            10    8.0  16.0  16.0   1989.0  15344.0  321.791333
-            11    4.0  16.0  16.0   4097.0    163.0  122.104888
-            12    2.0  16.0  16.0   8122.0   2159.0  337.060283
-            13    1.0  16.0  16.0  16281.0      7.0    5.234012
-            14    2.0  48.0  48.0   1021.0   1020.0  214.657425
-            15    2.0  48.0  48.0   2001.0   2048.0  314.429118
-            16    2.0  48.0  48.0   3996.0   9639.0  330.411368
-            17    2.0  48.0  48.0   8181.0   1021.0  324.614980
+Using Flash Attention 2 
+-----------------------
+
+.. code-block:: python
+
+   import torch
+   from transformers import AutoModelForCausalLM, AutoTokenizer
+   device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+   model_name = "NousResearch/Llama-3.2-1B"
+
+   tokenizer = AutoTokenizer.from_pretrained(model_name, dtype=torch.bfloat16, use_fast=False)
+   inputs = tokenizer('Today is', return_tensors='pt').to(device)
+
+   model_eager = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16, attn_implementation="eager").cuda(device)
+   model_ckFAv2 = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16, attn_implementation="flash_attention_2").cuda(device)
+   model_eager.generation_config.pad_token_id = model_eager.generation_config.eos_token_id
+   model_ckFAv2.generation_config.pad_token_id = model_ckFAv2.generation_config.eos_token_id
+
+   print("eager\n GQA: ", tokenizer.decode(model_eager.generate(**inputs, max_new_tokens=22, do_sample=False, num_beams=1)[0], skip_special_tokens=True))
+   print("ckFAv2\n GQA: ", tokenizer.decode(model_ckFAv2.generate(**inputs, max_new_tokens=22, do_sample=False, num_beams=1)[0], skip_special_tokens=True))
+
+The outputs from eager mode and FlashAttention-2 are identical, although their performance behavior differs.
+
+.. code-block:: shell
+
+   eager
+   GQA:  Today is the 10th anniversary of the 9/11 attacks. I remember that day like it was yesterday.
+   ckFAv2
+   GQA:  Today is the 10th anniversary of the 9/11 attacks. I remember that day like it was yesterday.

 xFormers
 ========
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.11.1-20251103.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.11.1-20251103.rst
@@ -0,0 +1,472 @@
+:orphan:
+
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the ROCm vLLM Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   inference performance documentation. See :doc:`../vllm` for the latest version.
+
+.. _vllm-benchmark-unified-docker-1103:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
+
+   {% set docker = data.dockers[0] %}
+
+   The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers a
+   prebuilt, optimized environment for validating large language model (LLM)
+   inference performance on AMD Instinct™ MI355X, MI350X, MI325X and MI300X
+   GPUs. This ROCm vLLM Docker image integrates vLLM and PyTorch tailored
+   specifically for AMD data center GPUs and includes the following components:
+
+   .. tab-set::
+
+      .. tab-item:: {{ docker.pull_tag }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+            {% endfor %}
+
+With this Docker image, you can quickly test the :ref:`expected
+inference performance numbers <vllm-benchmark-performance-measurements-1103>` for
+AMD Instinct GPUs.
+
+What's new
+==========
+
+The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
+
+* Enabled :ref:`AITER <vllm-optimization-aiter-switches>` by default.
+
+* Fixed ``rms_norm`` segfault issue with Qwen 3 235B.
+
+* Known performance degradation on Llama 4 models due to `an upstream vLLM issue <https://github.com/vllm-project/vllm/issues/26320>`_.
+
+.. _vllm-benchmark-supported-models-1103:
+
+Supported models
+================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
+
+   {% set docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}
+
+   .. _vllm-benchmark-available-models-1103:
+
+   The following models are supported for inference performance benchmarking
+   with vLLM and ROCm. Some instructions, commands, and recommendations in this
+   documentation might vary by model -- select one to get started. MXFP4 models
+   are only supported on MI355X and MI350X GPUs.
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
+      </div>
+
+   .. _vllm-benchmark-vllm-1103:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.mad_tag }}
+
+
+      {% if model.precision == "float4" %}
+      .. important::
+
+         MXFP4 is supported only on MI355X and MI350X GPUs.
+      {% endif %}
+
+      .. note::
+
+         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+         Some models require access authorization prior to use via an external license agreement through a third party.
+      {% if model.precision == "float8" and model.model_repo.startswith("amd") %}
+         This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
+      {% endif %}
+      {% if model.precision == "float4" and model.model_repo.startswith("amd") %}
+         This model uses FP4 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
+      {% endif %}
+
+      {% endfor %}
+   {% endfor %}
+
+.. _vllm-benchmark-performance-measurements-1103:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+page provides reference throughput and serving measurements for inferencing popular AI models.
+
+.. important::
+
+   The performance data presented in
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+   only reflects the latest version of this inference benchmarking environment.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct GPUs or ROCm software.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting inference.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+Pull the Docker image
+=====================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
+
+   {% set docker = data.dockers[0] %}
+
+   Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull {{ docker.pull_tag }}
+
+Benchmarking
+============
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
+
+   {% set docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}
+
+   Once the setup is complete, choose between two options to reproduce the
+   benchmark results:
+
+   .. _vllm-benchmark-mad-1103:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            The following run command is tailored to {{ model.model }}.
+            See :ref:`vllm-benchmark-supported-models-1103` to switch to another available model.
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. On the host machine, use this command to run the performance benchmark test on
+               the `{{model.model}} <{{ model.url }}>`_ model using one node with the
+               :literal:`{{model.precision}}` data type.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
+            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
+            and ``{{ model.mad_tag }}_serving.csv``.
+
+            Although the :ref:`available models
+            <vllm-benchmark-available-models-1103>` are preconfigured to collect
+            offline throughput and online serving performance data, you can
+            also change the benchmarking parameters. See the standalone
+            benchmarking tab for more information.
+
+            {% if model.tunableop %}
+
+            .. note::
+
+               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
+               TunableOp automatically explores different implementations and configurations of certain PyTorch
+               operators to find the fastest one for your hardware.
+
+               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
+               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
+               the ``--tunableop on`` argument in your run.
+
+               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
+               performance-collection run.
+
+            {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            The following commands are optimized for {{ model.model }}.
+            See :ref:`vllm-benchmark-supported-models-1103` to switch to another available model.
+
+            .. seealso::
+
+               For more information on configuration, see the `config files
+               <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
+               in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
+               for descriptions of available configuration options
+               and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
+               additional benchmarking information.
+
+            .. rubric:: Launch the container
+
+            You can run the vLLM benchmark tool independently by starting the
+            `Docker container <{{ docker.docker_hub_url }}>`_ as shown
+            in the following snippet.
+
+            .. code-block:: shell
+
+               docker pull {{ docker.pull_tag }}
+               docker run -it \
+                   --device=/dev/kfd \
+                   --device=/dev/dri \
+                   --group-add video \
+                   --shm-size 16G \
+                   --security-opt seccomp=unconfined \
+                   --security-opt apparmor=unconfined \
+                   --cap-add=SYS_PTRACE \
+                   -v $(pwd):/workspace \
+                   --env HUGGINGFACE_HUB_CACHE=/workspace \
+                   --name test \
+                   {{ docker.pull_tag }}
+
+            .. rubric:: Throughput command
+
+            Use the following command to start the throughput benchmark.
+
+            .. code-block:: shell
+
+               model={{ model.model_repo }}
+               tp={{ model.config.tp }}
+               num_prompts={{ model.config.num_prompts | default(1024) }}
+               in={{ model.config.in | default(128) }}
+               out={{ model.config.out | default(128) }}
+               dtype={{ model.config.dtype | default("auto") }}
+               kv_cache_dtype={{ model.config.kv_cache_dtype }}
+               max_num_seqs={{ model.config.max_num_seqs | default(1024) }}
+               max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
+               max_model_len={{ model.config.max_model_len }}
+
+               vllm bench throughput --model $model \
+                   -tp $tp \
+                   --num-prompts $num_prompts \
+                   --input-len $in \
+                   --output-len $out \
+                   --dtype $dtype \
+                   --kv-cache-dtype $kv_cache_dtype \
+                   --max-num-seqs $max_num_seqs \
+                   --max-num-batched-tokens $max_num_batched_tokens \
+                   --max-model-len $max_model_len \
+                   --trust-remote-code \
+                   --output-json ${model}_throughput.json \
+                   --gpu-memory-utilization {{ model.config.gpu_memory_utilization | default(0.9) }}
+
+            .. rubric:: Serving command
+
+            1. Start the server using the following command:
+
+               .. code-block:: shell
+
+                  model={{ model.model_repo }}
+                  tp={{ model.config.tp }}
+                  dtype={{ model.config.dtype }}
+                  kv_cache_dtype={{ model.config.kv_cache_dtype }}
+                  max_num_seqs=256
+                  max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
+                  max_model_len={{ model.config.max_model_len }}
+
+                  vllm serve $model \
+                      -tp $tp \
+                      --dtype $dtype \
+                      --kv-cache-dtype $kv_cache_dtype \
+                      --max-num-seqs $max_num_seqs \
+                      --max-num-batched-tokens $max_num_batched_tokens \
+                      --max-model-len $max_model_len \
+                      --no-enable-prefix-caching \
+                      --swap-space 16 \
+                      --disable-log-requests \
+                      --trust-remote-code \
+                      --gpu-memory-utilization 0.9
+
+               Wait until the model has loaded and the server is ready to accept requests.
+
+            2. On another terminal on the same machine, run the benchmark:
+
+               .. code-block:: shell
+
+                  # Connect to the container
+                  docker exec -it test bash
+
+                  # Wait for the server to start
+                  until curl -s http://localhost:8000/v1/models; do sleep 30; done
+
+                  # Run the benchmark
+                  model={{ model.model_repo }}
+                  max_concurrency=1
+                  num_prompts=10
+                  in=128
+                  out=128
+                  vllm bench serve --model $model \
+                      --percentile-metrics "ttft,tpot,itl,e2el" \
+                      --dataset-name random \
+                      --ignore-eos \
+                      --max-concurrency $max_concurrency \
+                      --num-prompts $num_prompts \
+                      --random-input-len $in \
+                      --random-output-len $out \
+                      --trust-remote-code \
+                      --save-result \
+                      --result-filename ${model}_serving.json
+
+            .. note::
+
+               For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B,
+               try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands.
+
+               If you encounter the following error, pass your access-authorized Hugging
+               Face token to the gated models.
+
+               .. code-block::
+
+                  OSError: You are trying to access a gated repo.
+
+                  # pass your HF_TOKEN
+                  export HF_TOKEN=$your_personal_hf_token
+
+            .. raw:: html
+
+               <style>
+               mjx-container[jax="CHTML"][display="true"] {
+                  text-align: left;
+                  margin: 0;
+               }
+               </style>
+
+            .. note::
+
+               Throughput is calculated as:
+
+               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+      {% endfor %}
+   {% endfor %}
+
+Advanced usage
+==============
+
+For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
+see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
+
+.. note::
+
+   If you’re using this Docker image on other AMD GPUs such as the AMD Instinct MI200 Series or Radeon, add ``export VLLM_ROCM_USE_AITER=0`` to your command, since AITER is only supported on gfx942 and gfx950 architectures.
+
+Reproducing the Docker image
+----------------------------
+
+To reproduce this ROCm-enabled vLLM Docker image release, follow these steps:
+
+1. Clone the `vLLM repository <https://github.com/vllm-project/vllm>`__.
+
+   .. code-block:: shell
+
+      git clone https://github.com/vllm-project/vllm.git
+      cd vllm
+
+2. Use the following command to build the image directly from the specified commit.
+
+   .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
+
+      {% set docker = data.dockers[0] %}
+      .. code-block:: shell
+
+         docker build -f docker/Dockerfile.rocm \
+             --build-arg REMOTE_VLLM=1 \
+             --build-arg VLLM_REPO=https://github.com/ROCm/vllm \
+             --build-arg VLLM_BRANCH="{{ docker.dockerfile.commit }}" \
+             -t vllm-rocm .
+
+   .. tip::
+
+      Replace ``vllm-rocm`` with your desired image tag.
+
+Further reading
+===============
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+  a brief introduction to vLLM and optimization strategies.
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
@@ -16,15 +16,23 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
     - Components
     - Resources

-   * - ``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251024``
-       (latest)
+   * - ``rocm/vllm:rocm7.0.0_vllm_0.11.2_20251210``
+     -
+       * ROCm 7.0.0
+       * vLLM 0.11.2
+       * PyTorch 2.9.0
+     -
+       * :doc:`Documentation <../vllm>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.2_20251210/images/sha256-e7f02dd2ce3824959658bc0391296f6158638e3ebce164f6c019c4eca8150ec7>`__
+
+   * - ``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103``
     -
       * ROCm 7.0.0
       * vLLM 0.11.1
       * PyTorch 2.9.0
     -
-       * :doc:`Documentation <../vllm>`
-       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5>`__
+       * :doc:`Documentation <vllm-0.11.1-20251103>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506>`__

   * - ``rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006``
     -
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10.rst
@@ -1,3 +1,5 @@
+:orphan:
+
 .. meta::
   :description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
                 prebuilt and optimized docker images.
@@ -7,7 +9,7 @@
 xDiT diffusion inference
 ************************

-.. _xdit-video-diffusion:
+.. _xdit-video-diffusion-2510:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml

@@ -150,7 +152,7 @@ run benchmarks and generate outputs.
     {% endfor %}
   {% endfor %}

-.. _xdit-video-diffusion-setup:
+.. _xdit-video-diffusion-setup-2510:

 Prepare the model
 -----------------
@@ -158,7 +160,7 @@ Prepare the model
 .. note::

   If you're using ROCm MAD to :ref:`run your model
-   <xdit-video-diffusion-run>`, you can skip this section. MAD will handle
+   <xdit-video-diffusion-run-2510>`, you can skip this section. MAD will handle
   starting the container and downloading required models inside the container.

 You can either use an existing Hugging Face cache or download the model fresh inside the container.
@@ -253,7 +255,7 @@ You can either use an existing Hugging Face cache or download the model fresh in
     {% endfor %}
   {% endfor %}

-.. _xdit-video-diffusion-run:
+.. _xdit-video-diffusion-run-2510:

 Run inference
 =============
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.11.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.11.rst
@@ -0,0 +1,389 @@
+:orphan:
+
+.. meta::
+   :description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
+                 prebuilt and optimized docker images.
+   :keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
+
+************************
+xDiT diffusion inference
+************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of the ROCm xDiT
+   diffusion inference documentation. See
+   :doc:`/how-to/rocm-for-ai/inference/xdit-diffusion-inference` for the latest
+   version.
+
+.. _xdit-video-diffusion-2511:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+
+   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers a prebuilt, optimized environment based on `xDiT <https://github.com/xdit-project/xDiT>`_ for
+   benchmarking diffusion model video and image generation on gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X) GPUs.
+   The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_
+   and includes the following components:
+
+   .. dropdown:: Software components
+
+      .. list-table::
+         :header-rows: 1
+
+         * - Software component
+           - Version
+
+         {% for component_name, component_version in docker.components.items() %}
+         * - {{ component_name }}
+           - {{ component_version }}
+         {% endfor %}
+
+Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
+For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
+
+What's new
+==========
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+
+   {% for item in docker.whats_new %}
+   * {{ item }}
+   {% endfor %}
+
+.. _xdit-video-diffusion-supported-models-2511:
+
+Supported models
+================
+
+The following models are supported for inference performance benchmarking.
+Some instructions, commands, and recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+
+   {# Create a lookup for supported models #}
+   {% set supported_lookup = {} %}
+   {% for supported in docker.supported_models %}
+   {% set _ = supported_lookup.update({supported.group: supported.models}) %}
+   {% endfor %}
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+          <div class="row gx-0">
+              <div class="col-2 me-1 px-2 model-param-head">Model</div>
+              <div class="row col-10 pe-0">
+        {% for model_group in model_groups %}
+            {% if model_group.group in supported_lookup %}
+                  <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+            {% endif %}
+        {% endfor %}
+              </div>
+          </div>
+
+          <div class="row gx-0 pt-1">
+              <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+              <div class="row col-10 pe-0">
+        {% for model_group in model_groups %}
+            {% if model_group.group in supported_lookup %}
+            {% set supported_models = supported_lookup[model_group.group] %}
+            {% set models = model_group.models %}
+            {% for model in models %}
+                {% if model.model in supported_models %}
+                {% if models|length % 3 == 0 %}
+                <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.page_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+                {% else %}
+                <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.page_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+                {% endif %}
+                {% endif %}
+            {% endfor %}
+            {% endif %}
+        {% endfor %}
+              </div>
+          </div>
+      </div>
+
+   {% for model_group in model_groups %}
+       {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.page_tag }}
+
+      .. note::
+
+         To learn more about your specific model, see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
+         or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
+         external license agreement through a third party.
+
+       {% endfor %}
+   {% endfor %}
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+Pull the Docker image
+=====================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+
+   For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
+   Pull the image using the following command:
+
+   .. code-block:: shell
+
+      docker pull {{ docker.pull_tag }}
+
+Validate and benchmark
+======================
+
+Once the image has been downloaded, you can follow these steps to
+run benchmarks and generate outputs.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+
+   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
+   {% for model_group in model_groups %}
+     {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.page_tag}}
+
+      The following commands are written for {{ model.model }}.
+      See :ref:`xdit-video-diffusion-supported-models-2511` to switch to another available model.
+
+     {% endfor %}
+   {% endfor %}
+
+Choose your setup method
+------------------------
+
+You can either use an existing Hugging Face cache or download the model fresh inside the container.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+
+   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+
+   {% for model_group in model_groups %}
+     {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.page_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: Option 1: Use existing Hugging Face cache
+
+            If you already have models downloaded on your host system, you can mount your existing cache.
+
+            1. Set your Hugging Face cache location.
+
+               .. code-block:: shell
+
+                  export HF_HOME=/your/hf_cache/location
+            2. Download the model (if not already cached).
+
+               .. code-block:: shell
+
+                  huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
+            3. Launch the container with mounted cache.
+
+               .. code-block:: shell
+
+                  docker run \
+                      -it --rm \
+                      --cap-add=SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --user root \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --ipc=host \
+                      --network host \
+                      --privileged \
+                      --shm-size 128G \
+                      --name pytorch-xdit \
+                      -e HSA_NO_SCRATCH_RECLAIM=1 \
+                      -e OMP_NUM_THREADS=16 \
+                      -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+                      -e HF_HOME=/app/huggingface_models \
+                      -v $HF_HOME:/app/huggingface_models \
+                      {{ docker.pull_tag }}
+         .. tab-item:: Option 2: Download inside container
+
+            Use this option if you prefer to keep the container self-contained or don't have an existing cache.
+
+            1. Launch the container.
+
+               .. code-block:: shell
+
+                  docker run \
+                      -it --rm \
+                      --cap-add=SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --user root \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --ipc=host \
+                      --network host \
+                      --privileged \
+                      --shm-size 128G \
+                      --name pytorch-xdit \
+                      -e HSA_NO_SCRATCH_RECLAIM=1 \
+                      -e OMP_NUM_THREADS=16 \
+                      -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+                      {{ docker.pull_tag }}
+            2. Inside the container, set the Hugging Face cache location and download the model.
+
+               .. code-block:: shell
+
+                  export HF_HOME=/app/huggingface_models
+                  huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
+
+               .. warning::
+
+                  Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
+     {% endfor %}
+   {% endfor %}
+
+Run inference
+=============
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
+
+   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+   {% for model_group in model_groups %}
+     {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.page_tag }}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. On the host machine, use this command to run the performance benchmark test on
+               the `{{model.model}} <{{ model.url }}>`_ model using one node.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
+            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
+            and ``{{ model.mad_tag }}_serving.csv``.
+
+         .. tab-item:: Standalone benchmarking
+
+            To run the benchmarks for {{ model.model }}, use the following command:
+
+            .. code-block:: shell
+
+            {% if model.model == "Hunyuan Video" %}
+               cd /app/Hunyuanvideo
+               mkdir results
+               torchrun --nproc_per_node=8 run.py \
+                  --model tencent/HunyuanVideo \
+                  --prompt "In the large cage, two puppies were wagging their tails at each other." \
+                  --height 720 --width 1280 --num_frames 129 \
+                  --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
+                  --ulysses_degree 8 \
+                  --enable_tiling --enable_slicing \
+                  --use_torch_compile \
+                  --bench_output results
+            {% endif %}
+            {% if model.model == "Wan2.1" %}
+               cd Wan2.1
+               mkdir results
+               torchrun --nproc_per_node=8 run.py \
+                  --task i2v-14B \
+                  --size 720*1280 --frame_num 81 \
+                  --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \
+                  --image "/app/Wan2.1/examples/i2v_input.JPG" \
+                  --ulysses_size 8 --ring_size 1 \
+                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
+                  --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
+                  --offload_model 0 \
+                  --vae_dtype bfloat16 \
+                  --allow_tf32 \
+                  --compile
+            {% endif %}
+            {% if model.model == "Wan2.2" %}
+               cd Wan2.2
+               mkdir results
+               torchrun --nproc_per_node=8 run.py \
+                  --task i2v-A14B \
+                  --size 720*1280 --frame_num 81 \
+                  --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \
+                  --image "/app/Wan2.2/examples/i2v_input.JPG" \
+                  --ulysses_size 8 --ring_size 1 \
+                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
+                  --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
+                  --offload_model 0 \
+                  --vae_dtype bfloat16 \
+                  --allow_tf32 \
+                  --compile
+            {% endif %}
+            {% if model.model == "FLUX.1" %}
+               cd Flux
+               mkdir results
+               torchrun --nproc_per_node=8 /app/Flux/run.py \
+                  --model black-forest-labs/FLUX.1-dev \
+                  --seed 42 \
+                  --prompt "A small cat" \
+                  --height 1024 \
+                  --width 1024 \
+                  --num_inference_steps 25 \
+                  --max_sequence_length 256 \
+                  --warmup_steps 5 \
+                  --no_use_resolution_binning \
+                  --ulysses_degree 8 \
+                  --use_torch_compile \
+                  --num_repetitions 1 \
+                  --benchmark_output_directory results
+            {% endif %}
+            The generated output will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout{% elif model.model in ["Wan2.1", "Wan2.2"] %}``results/outputs/rank0_*.json``{% elif model.model == "FLUX.1" %}``results/timing.json``{% endif %}.
+            {% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}
+      {% endfor %}
+    {% endfor %}
+
+Previous versions
+=================
+
+See
+:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history`
+to find documentation for previous releases of xDiT diffusion inference
+performance testing.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12.rst
@@ -0,0 +1,411 @@
+:orphan:
+
+.. meta::
+   :description: Learn to validate diffusion model video generation on MI300X, MI350X, and MI355X accelerators using
+                 prebuilt and optimized Docker images.
+   :keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
+
+************************
+xDiT diffusion inference
+************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of the ROCm xDiT
+   diffusion inference documentation. See
+   :doc:`/how-to/rocm-for-ai/inference/xdit-diffusion-inference` for the latest
+   version.
+
+.. _xdit-video-diffusion-2512:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
+
+   {% set docker = data.docker %}
+
+   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
+   a prebuilt, optimized environment based on `xDiT
+   <https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion model
+   video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
+   and MI300X (gfx942) GPUs.
+
+   The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_
+   and includes the following components:
+
+   .. dropdown:: Software components
+
+      .. list-table::
+         :header-rows: 1
+
+         * - Software component
+           - Version
+
+         {% for component_name, component_data in docker.components.items() %}
+         * - `{{ component_name }} <{{ component_data.url }}>`_
+           - {{ component_data.version }}
+         {% endfor %}
+
+Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
+For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
+
+What's new
+==========
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
+
+   {% set docker = data.docker %}
+
+   {% for item in docker.whats_new %}
+   * {{ item }}
+   {% endfor %}
+
+.. _xdit-video-diffusion-supported-models-2512:
+
+Supported models
+================
+
+The following models are supported for inference performance benchmarking.
+Some instructions, commands, and recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
+
+   {% set docker = data.docker %}
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+          <div class="row gx-0">
+              <div class="col-2 me-1 px-2 model-param-head">Model</div>
+              <div class="row col-10 pe-0">
+        {% for model_group in docker.supported_models %}
+               <div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.js_tag }}" tabindex="0">{{ model_group.group }}</div>
+        {% endfor %}
+              </div>
+          </div>
+
+          <div class="row gx-0 pt-1">
+              <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+              <div class="row col-10 pe-0">
+        {% for model_group in docker.supported_models %}
+            {% set models = model_group.models %}
+            {% for model in models %}
+                {% if models|length % 3 == 0 %}
+                <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
+                {% else %}
+                <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
+                {% endif %}
+            {% endfor %}
+        {% endfor %}
+              </div>
+          </div>
+      </div>
+
+   {% for model_group in docker.supported_models %}
+       {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.js_tag }}
+
+      .. note::
+
+         To learn more about your specific model, see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
+         or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
+         external license agreement through a third party.
+
+       {% endfor %}
+   {% endfor %}
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+Pull the Docker image
+=====================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
+
+   {% set docker = data.docker %}
+
+   For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
+   Pull the image using the following command:
+
+   .. code-block:: shell
+
+      docker pull {{ docker.pull_tag }}
+
+Validate and benchmark
+======================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
+
+   {% set docker = data.docker %}
+
+   Once the image has been downloaded, you can follow these steps to
+   run benchmarks and generate outputs.
+
+   {% for model_group in docker.supported_models %}
+     {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.js_tag}}
+
+      The following commands are written for {{ model.model }}.
+      See :ref:`xdit-video-diffusion-supported-models-2512` to switch to another available model.
+
+     {% endfor %}
+   {% endfor %}
+
+Choose your setup method
+------------------------
+
+You can either use an existing Hugging Face cache or download the model fresh inside the container.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
+
+   {% set docker = data.docker %}
+
+   {% for model_group in docker.supported_models %}
+     {% for model in model_group.models %}
+   .. container:: model-doc {{model.js_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: Option 1: Use existing Hugging Face cache
+
+            If you already have models downloaded on your host system, you can mount your existing cache.
+
+            1. Set your Hugging Face cache location.
+
+               .. code-block:: shell
+
+                  export HF_HOME=/your/hf_cache/location
+
+            2. Download the model (if not already cached).
+
+               .. code-block:: shell
+
+                  huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
+
+            3. Launch the container with mounted cache.
+
+               .. code-block:: shell
+
+                  docker run \
+                      -it --rm \
+                      --cap-add=SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --user root \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --ipc=host \
+                      --network host \
+                      --privileged \
+                      --shm-size 128G \
+                      --name pytorch-xdit \
+                      -e HSA_NO_SCRATCH_RECLAIM=1 \
+                      -e OMP_NUM_THREADS=16 \
+                      -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+                      -e HF_HOME=/app/huggingface_models \
+                      -v $HF_HOME:/app/huggingface_models \
+                      {{ docker.pull_tag }}
+
+         .. tab-item:: Option 2: Download inside container
+
+            Use this option if you prefer to keep the container self-contained or don't have an existing cache.
+
+            1. Launch the container.
+
+               .. code-block:: shell
+
+                  docker run \
+                      -it --rm \
+                      --cap-add=SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --user root \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --ipc=host \
+                      --network host \
+                      --privileged \
+                      --shm-size 128G \
+                      --name pytorch-xdit \
+                      -e HSA_NO_SCRATCH_RECLAIM=1 \
+                      -e OMP_NUM_THREADS=16 \
+                      -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+                      {{ docker.pull_tag }}
+
+            2. Inside the container, set the Hugging Face cache location and download the model.
+
+               .. code-block:: shell
+
+                  export HF_HOME=/app/huggingface_models
+                  huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
+
+               .. warning::
+
+                  Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
+     {% endfor %}
+   {% endfor %}
+
+Run inference
+=============
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
+
+   {% set docker = data.docker %}
+
+   {% for model_group in docker.supported_models %}
+     {% for model in model_group.models %}
+
+   .. container:: model-doc {{ model.js_tag }}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. On the host machine, use this command to run the performance benchmark test on
+               the `{{model.model}} <{{ model.url }}>`_ model using one node.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output
+                     
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
+            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
+            and ``{{ model.mad_tag }}_serving.csv``.
+
+         .. tab-item:: Standalone benchmarking
+
+            To run the benchmarks for {{ model.model }}, use the following command:
+
+            .. code-block:: shell
+            {% if model.model == "Hunyuan Video" %}
+               cd /app/Hunyuanvideo
+               mkdir results
+
+               torchrun --nproc_per_node=8 run.py \
+                  --model {{ model.model_repo }} \
+                  --prompt "In the large cage, two puppies were wagging their tails at each other." \
+                  --height 720 --width 1280 --num_frames 129 \
+                  --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
+                  --ulysses_degree 8 \
+                  --enable_tiling --enable_slicing \
+                  --use_torch_compile \
+                  --bench_output results
+
+            {% endif %}
+            {% if model.model == "Wan2.1" %}
+               cd Wan
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/Wan/run.py \
+                  --task i2v \
+                  --height 720 \
+                  --width 1280 \
+                  --model {{ model.model_repo }} \
+                  --img_file_path /app/Wan/i2v_input.JPG \
+                  --ulysses_degree 8 \
+                  --seed 42 \
+                  --num_frames 81 \
+                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
+                  --num_repetitions 1 \
+                  --num_inference_steps 40 \
+                  --use_torch_compile
+
+            {% endif %}
+            {% if model.model == "Wan2.2" %}
+               cd Wan
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/Wan/run.py \
+                  --task i2v \
+                  --height 720 \
+                  --width 1280 \
+                  --model {{ model.model_repo }} \
+                  --img_file_path /app/Wan/i2v_input.JPG \
+                  --ulysses_degree 8 \
+                  --seed 42 \
+                  --num_frames 81 \
+                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
+                  --num_repetitions 1 \
+                  --num_inference_steps 40 \
+                  --use_torch_compile
+
+            {% endif %}
+
+            {% if model.model == "FLUX.1" %}
+               cd Flux
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/Flux/run.py \
+                  --model {{ model.model_repo }} \
+                  --seed 42 \
+                  --prompt "A small cat" \
+                  --height 1024 \
+                  --width 1024 \
+                  --num_inference_steps 25 \
+                  --max_sequence_length 256 \
+                  --warmup_steps 5 \
+                  --no_use_resolution_binning \
+                  --ulysses_degree 8 \
+                  --use_torch_compile \
+                  --num_repetitions 50
+
+            {% endif %}
+
+            {% if model.model == "stable-diffusion-3.5-large" %}
+               cd StableDiffusion3.5 
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \
+                  --model {{ model.model_repo }} \
+                  --num_inference_steps 28 \
+                  --prompt "A capybara holding a sign that reads Hello World" \
+                  --use_torch_compile \
+                  --pipefusion_parallel_degree 4 \
+                  --use_cfg_parallel \
+                  --num_repetitions 50 \
+                  --dtype torch.float16 \
+                  --output_path results
+
+            {% endif %}
+
+            The generated output will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout{% elif model.model in ["Wan2.1", "Wan2.2"] %}``results/outputs/rank0_*.json``{% elif model.model == "FLUX.1" %}``results/timing.json``{% elif model.model == "stable-diffusion-3.5-large" %}``benchmark_results.csv``{% endif %}.
+
+            {% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}
+
+      {% endfor %}
+    {% endfor %}
+
+Previous versions
+=================
+
+See
+:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history`
+to find documentation for previous releases of xDiT diffusion inference
+performance testing.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history.rst
@@ -15,42 +15,33 @@ benchmarking, see the version-specific documentation.
     - Components
     - Resources

-   * - ``rocm/pytorch-xdit:v25.11`` (latest)
+   * - ``rocm/pytorch-xdit:v25.13`` (latest)
     - 
-       * ROCm 7.10.0 preview
-       * TheRock 3e3f834
-       * rccl d23d18f
-       * composable_kernel 2570462
-       * rocm-libraries 0588f07
-       * rocm-systems 473025a
-       * torch 73adac
-       * torchvision f5c6c2e
-       * triton 7416ffc
-       * accelerate 34c1779
-       * aiter de14bec
-       * diffusers 40528e9
-       * xfuser 83978b5
-       * yunchang 2c9b712
+       * TheRock 1728a81
     - 
       * :doc:`Documentation <../../xdit-diffusion-inference>`
-       * `Docker Hub <https://hub.docker.com/r/rocm/pytorch-xdit>`__
+       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef>`__
+
+   * - ``rocm/pytorch-xdit:v25.12``
+     - 
+       * `ROCm 7.10.0 preview <https://rocm.docs.amd.com/en/7.10.0-preview/about/release-notes.html>`__
+       * TheRock 3e3f834
+     - 
+       * :doc:`Documentation <xdit-25.12>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.12/images/sha256-e06895132316bf3c393366b70a91eaab6755902dad0100e6e2b38310547d9256>`__
+
+   * - ``rocm/pytorch-xdit:v25.11``
+     - 
+       * `ROCm 7.10.0 preview <https://rocm.docs.amd.com/en/7.10.0-preview/about/release-notes.html>`__
+       * TheRock 3e3f834
+     - 
+       * :doc:`Documentation <xdit-25.11>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.11/images/sha256-c9fa659439bb024f854b4d5eea598347251b02c341c55f66c98110832bde4216>`__

   * - ``rocm/pytorch-xdit:v25.10``
     - 
-       * ROCm 7.9.0 preview
+       * `ROCm 7.9.0 preview <https://rocm.docs.amd.com/en/7.9.0-preview/about/release-notes.html>`__
       * TheRock 7afbe45
-       * rccl 9b04b2a
-       * composable_kernel b7a806f
-       * rocm-libraries f104555
-       * rocm-systems 25922d0
-       * torch 2.10.0a0+gite9c9017
-       * torchvision 0.22.0a0+966da7e
-       * triton 3.5.0+git52e49c12
-       * accelerate 1.11.0.dev0
-       * aiter 0.1.5.post4.dev20+ga25e55e79
-       * diffusers 0.36.0.dev0
-       * xfuser 0.4.4
-       * yunchang 0.6.3.post1
     - 
       * :doc:`Documentation <xdit-25.10>`
-       * `Docker Hub <https://hub.docker.com/r/rocm/pytorch-xdit>`__
+       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.10/images/sha256-d79715ff18a9470e3f907cec8a9654d6b783c63370b091446acffc0de4d7070e>`__
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
@@ -6,7 +6,7 @@
 vLLM inference performance testing
 **********************************

-.. _vllm-benchmark-unified-docker-1024:
+.. _vllm-benchmark-unified-docker-1210:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml

@@ -34,21 +34,18 @@ vLLM inference performance testing
            {% endfor %}

 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements-1024>` for
+inference performance numbers <vllm-benchmark-performance-measurements-1210>` for
 AMD Instinct GPUs.

 What's new
 ==========

-The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
+The following is a summary of notable changes since the :doc:`previous ROCm/vLLM
+Docker release <previous-versions/vllm-history>`.

-* Enabled :ref:`AITER <vllm-optimization-aiter-switches>` by default.
+- Improved performance on Llama 3 MXFP4 through AITER optimizations and improved kernel fusion.

-* Fixed ``rms_norm`` segfault issue with Qwen 3 235B.
-
-* Known performance degradation on Llama 4 models due to `an upstream vLLM issue <https://github.com/vllm-project/vllm/issues/26320>`_.
-
-.. _vllm-benchmark-supported-models-1024:
+.. _vllm-benchmark-supported-models-1210:

 Supported models
 ================
@@ -58,7 +55,7 @@ Supported models
   {% set docker = data.dockers[0] %}
   {% set model_groups = data.model_groups %}

-   .. _vllm-benchmark-available-models-1024:
+   .. _vllm-benchmark-available-models-1210:

   The following models are supported for inference performance benchmarking
   with vLLM and ROCm. Some instructions, commands, and recommendations in this
@@ -94,7 +91,7 @@ Supported models
         </div>
      </div>

-   .. _vllm-benchmark-vllm-1024:
+   .. _vllm-benchmark-vllm-1210:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
@@ -108,6 +105,15 @@ Supported models
         MXFP4 is supported only on MI355X and MI350X GPUs.
      {% endif %}

+      {% if model.mad_tag in ["pyt_vllm_mixtral-8x7b", "pyt_vllm_mixtral-8x7b_fp8", "pyt_vllm_mixtral-8x22b", "pyt_vllm_mixtral-8x22b_fp8", "pyt_vllm_deepseek-r1"] %}
+      .. caution::
+
+         There is a known regression with AITER for MoE models such as Mixtral and
+         DeepSeek-R1. Consider using the :doc:`previous release
+         <previous-versions/vllm-0.11.1-20251103>`
+         ``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103`` for better performance.
+      {% endif %}
+
      .. note::

         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
@@ -122,7 +128,7 @@ Supported models
      {% endfor %}
   {% endfor %}

-.. _vllm-benchmark-performance-measurements-1024:
+.. _vllm-benchmark-performance-measurements-1210:

 Performance measurements
 ========================
@@ -178,7 +184,7 @@ Benchmarking
   Once the setup is complete, choose between two options to reproduce the
   benchmark results:

-   .. _vllm-benchmark-mad-1024:
+   .. _vllm-benchmark-mad-1210:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
@@ -190,7 +196,7 @@ Benchmarking
         .. tab-item:: MAD-integrated benchmarking

            The following run command is tailored to {{ model.model }}.
-            See :ref:`vllm-benchmark-supported-models-1024` to switch to another available model.
+            See :ref:`vllm-benchmark-supported-models-1210` to switch to another available model.

            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
               directory and install the required packages on the host machine.
@@ -219,7 +225,7 @@ Benchmarking
            and ``{{ model.mad_tag }}_serving.csv``.

            Although the :ref:`available models
-            <vllm-benchmark-available-models-1024>` are preconfigured to collect
+            <vllm-benchmark-available-models-1210>` are preconfigured to collect
            offline throughput and online serving performance data, you can
            also change the benchmarking parameters. See the standalone
            benchmarking tab for more information.
@@ -244,7 +250,7 @@ Benchmarking
         .. tab-item:: Standalone benchmarking

            The following commands are optimized for {{ model.model }}.
-            See :ref:`vllm-benchmark-supported-models-1024` to switch to another available model.
+            See :ref:`vllm-benchmark-supported-models-1210` to switch to another available model.

            .. seealso::

@@ -438,6 +444,14 @@ To reproduce this ROCm-enabled vLLM Docker image release, follow these steps:

      Replace ``vllm-rocm`` with your desired image tag.

+Known issues
+============
+
+There is a known regression with AITER for MoE models such as Mixtral and
+DeepSeek-R1. Consider using the :doc:`previous release
+<previous-versions/vllm-0.11.1-20251103>`
+(``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103``) for better performance.
+
 Further reading
 ===============

--- a/docs/how-to/rocm-for-ai/inference/index.rst
+++ b/docs/how-to/rocm-for-ai/inference/index.rst
@@ -26,7 +26,6 @@ training, fine-tuning, and inference. It leverages popular machine learning fram

 - :doc:`SGLang inference performance testing <benchmark-docker/sglang>`

- :doc:`Deploying your model <deploy-your-model>`
-
 - :doc:`xDiT diffusion inference <xdit-diffusion-inference>`

+- :doc:`Deploying your model <deploy-your-model>`
--- a/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
@@ -11,15 +11,19 @@ xDiT diffusion inference

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml

-   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
-   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+   {% set docker = data.docker %}

-   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers a prebuilt, optimized environment based on `xDiT <https://github.com/xdit-project/xDiT>`_ for
-   benchmarking diffusion model video and image generation on gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X) GPUs.
-   The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_
-   and includes the following components:
+   The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
+   a prebuilt, optimized environment based on `xDiT
+   <https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion model
+   video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
+   and MI300X (gfx942) GPUs.

-   .. dropdown:: Software components
+   The image runs a preview version of ROCm using the new `TheRock
+   <https://github.com/ROCm/TheRock>`__ build system and includes the following
+   components:
+
+   .. dropdown:: Software components - {{ docker.pull_tag.split('-')|last }}

      .. list-table::
         :header-rows: 1
@@ -27,9 +31,9 @@ xDiT diffusion inference
         * - Software component
           - Version

-         {% for component_name, component_version in docker.components.items() %}
-         * - {{ component_name }}
-           - {{ component_version }}
+         {% for component_name, component_data in docker.components.items() %}
+         * - `{{ component_name }} <{{ component_data.url }}>`_
+           - {{ component_data.version }}
         {% endfor %}

 Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
@@ -39,8 +43,7 @@ What's new
 ==========
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml

-   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
-   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+   {% set docker = data.docker %}

   {% for item in docker.whats_new %}
   * {{ item }}
@@ -57,14 +60,7 @@ vary by model -- select one to get started.

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml

-   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
-   {% set model_groups = data.xdit_diffusion_inference.model_groups %}
-   
-   {# Create a lookup for supported models #}
-   {% set supported_lookup = {} %}
-   {% for supported in docker.supported_models %}
-   {% set _ = supported_lookup.update({supported.group: supported.models}) %}
-   {% endfor %}
+   {% set docker = data.docker %}

   .. raw:: html

@@ -72,10 +68,8 @@ vary by model -- select one to get started.
          <div class="row gx-0">
              <div class="col-2 me-1 px-2 model-param-head">Model</div>
              <div class="row col-10 pe-0">
-        {% for model_group in model_groups %}
-            {% if model_group.group in supported_lookup %}
-                  <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-            {% endif %}
+        {% for model_group in docker.supported_models %}
+               <div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.js_tag }}" tabindex="0">{{ model_group.group }}</div>
        {% endfor %}
              </div>
          </div>
@@ -83,29 +77,24 @@ vary by model -- select one to get started.
          <div class="row gx-0 pt-1">
              <div class="col-2 me-1 px-2 model-param-head">Variant</div>
              <div class="row col-10 pe-0">
-        {% for model_group in model_groups %}
-            {% if model_group.group in supported_lookup %}
-            {% set supported_models = supported_lookup[model_group.group] %}
+        {% for model_group in docker.supported_models %}
            {% set models = model_group.models %}
            {% for model in models %}
-                {% if model.model in supported_models %}
                {% if models|length % 3 == 0 %}
-                <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.page_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+                <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
                {% else %}
-                <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.page_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-                {% endif %}
+                <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
                {% endif %}
            {% endfor %}
-            {% endif %}
        {% endfor %}
              </div>
          </div>
      </div>

-   {% for model_group in model_groups %}
+   {% for model_group in docker.supported_models %}
       {% for model in model_group.models %}

-   .. container:: model-doc {{ model.page_tag }}
+   .. container:: model-doc {{ model.js_tag }}

      .. note::

@@ -116,6 +105,22 @@ vary by model -- select one to get started.
       {% endfor %}
   {% endfor %}

+Performance measurements
+========================
+
+To evaluate performance, the `Performance results with AMD ROCm software
+<https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8543b7e6d-item-9eda09e707-tab>`__
+page provides reference throughput and serving measurements for inferencing popular AI models.
+
+.. important::
+
+   The performance data presented in `Performance results with AMD ROCm
+   software
+   <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8543b7e6d-item-9eda09e707-tab>`__
+   only reflects the latest version of this inference benchmarking environment.
+   The listed measurements should not be interpreted as the peak performance
+   achievable by AMD Instinct GPUs or ROCm software.
+
 System validation
 =================

@@ -136,7 +141,7 @@ Pull the Docker image

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml

-   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
+   {% set docker = data.docker %}

   For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
   Pull the image using the following command:
@@ -148,15 +153,17 @@ Pull the Docker image
 Validate and benchmark
 ======================

-Once the image has been downloaded you can follow these steps to
-run benchmarks and generate outputs.
-
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml

-   {% for model_group in model_groups %}
+   {% set docker = data.docker %}
+
+   Once the image has been downloaded, you can follow these steps to
+   run benchmarks and generate outputs.
+
+   {% for model_group in docker.supported_models %}
     {% for model in model_group.models %}

-   .. container:: model-doc {{model.page_tag}}
+   .. container:: model-doc {{model.js_tag}}

      The following commands are written for {{ model.model }}.
      See :ref:`xdit-video-diffusion-supported-models` to switch to another available model.
@@ -171,12 +178,11 @@ You can either use an existing Hugging Face cache or download the model fresh in

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml

-   {% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
-   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
+   {% set docker = data.docker %}

-   {% for model_group in model_groups %}
+   {% for model_group in docker.supported_models %}
     {% for model in model_group.models %}
-   .. container:: model-doc {{model.page_tag}}
+   .. container:: model-doc {{model.js_tag}}

      .. tab-set::

@@ -264,11 +270,12 @@ Run inference

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml

-   {% set model_groups = data.xdit_diffusion_inference.model_groups%}
-   {% for model_group in model_groups %}
+   {% set docker = data.docker %}
+
+   {% for model_group in docker.supported_models %}
     {% for model in model_group.models %}

-   .. container:: model-doc {{ model.page_tag }}
+   .. container:: model-doc {{ model.js_tag }}

      .. tab-set::

@@ -309,7 +316,7 @@ Run inference
               mkdir results

               torchrun --nproc_per_node=8 run.py \
-                  --model tencent/HunyuanVideo \
+                  --model {{ model.model_repo }} \
                  --prompt "In the large cage, two puppies were wagging their tails at each other." \
                  --height 720 --width 1280 --num_frames 129 \
                  --num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
@@ -317,48 +324,53 @@ Run inference
                  --enable_tiling --enable_slicing \
                  --use_torch_compile \
                  --bench_output results
+
            {% endif %}
            {% if model.model == "Wan2.1" %}
-               cd Wan2.1
+               cd /app/Wan
               mkdir results

-               torchrun --nproc_per_node=8 run.py \
-                  --task i2v-14B \
-                  --size 720*1280 --frame_num 81 \
-                  --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \
-                  --image "/app/Wan2.1/examples/i2v_input.JPG" \
-                  --ulysses_size 8 --ring_size 1 \
+               torchrun --nproc_per_node=8 /app/Wan/run.py \
+                  --task i2v \
+                  --height 720 \
+                  --width 1280 \
+                  --model {{ model.model_repo }} \
+                  --img_file_path /app/Wan/i2v_input.JPG \
+                  --ulysses_degree 8 \
+                  --seed 42 \
+                  --num_frames 81 \
                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
-                  --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
-                  --offload_model 0 \
-                  --vae_dtype bfloat16 \
-                  --allow_tf32 \
-                  --compile
+                  --num_repetitions 1 \
+                  --num_inference_steps 40 \
+                  --use_torch_compile
+
            {% endif %}
            {% if model.model == "Wan2.2" %}
-               cd Wan2.2
+               cd /app/Wan
               mkdir results

-               torchrun --nproc_per_node=8 run.py \
-                  --task i2v-A14B \
-                  --size 720*1280 --frame_num 81 \
-                  --ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \
-                  --image "/app/Wan2.2/examples/i2v_input.JPG" \
-                  --ulysses_size 8 --ring_size 1 \
+               torchrun --nproc_per_node=8 /app/Wan/run.py \
+                  --task i2v \
+                  --height 720 \
+                  --width 1280 \
+                  --model {{ model.model_repo }} \
+                  --img_file_path /app/Wan/i2v_input.JPG \
+                  --ulysses_degree 8 \
+                  --seed 42 \
+                  --num_frames 81 \
                  --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
-                  --benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
-                  --offload_model 0 \
-                  --vae_dtype bfloat16 \
-                  --allow_tf32 \
-                  --compile
+                  --num_repetitions 1 \
+                  --num_inference_steps 40 \
+                  --use_torch_compile
+
            {% endif %}

            {% if model.model == "FLUX.1" %}
-               cd Flux
+               cd /app/Flux
               mkdir results

               torchrun --nproc_per_node=8 /app/Flux/run.py \
-                  --model black-forest-labs/FLUX.1-dev \
+                  --model {{ model.model_repo }} \
                  --seed 42 \
                  --prompt "A small cat" \
                  --height 1024 \
@@ -369,12 +381,74 @@ Run inference
                  --no_use_resolution_binning \
                  --ulysses_degree 8 \
                  --use_torch_compile \
-                  --num_repetitions 1 \
-                  --benchmark_output_directory results
+                  --num_repetitions 50

            {% endif %}

-            The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% endif %}
+            {% if model.model == "FLUX.1 Kontext" %}
+               cd /app/Flux
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
+                  --model {{ model.model_repo }} \
+                  --seed 42 \
+                  --prompt "Add a cool hat to the cat" \
+                  --height 1024 \
+                  --width 1024 \
+                  --num_inference_steps 30 \
+                  --max_sequence_length 512 \
+                  --warmup_steps 5 \
+                  --no_use_resolution_binning \
+                  --ulysses_degree 8 \
+                  --use_torch_compile \
+                  --img_file_path /app/Flux/cat.png \
+                  --model_type flux_kontext \
+                  --guidance_scale 2.5 \
+                  --num_repetitions 25
+
+            {% endif %}
+
+            {% if model.model == "FLUX.2" %}
+               cd /app/Flux
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
+                  --model {{ model.model_repo }} \
+                  --seed 42 \
+                  --prompt "Add a cool hat to the cat" \
+                  --height 1024 \
+                  --width 1024 \
+                  --num_inference_steps 50 \
+                  --max_sequence_length 512 \
+                  --warmup_steps 5 \
+                  --no_use_resolution_binning \
+                  --ulysses_degree 8 \
+                  --use_torch_compile \
+                  --img_file_paths /app/Flux/cat.png \
+                  --model_type flux2 \
+                  --guidance_scale 4.0 \
+                  --num_repetitions 25
+
+            {% endif %}
+
+            {% if model.model == "stable-diffusion-3.5-large" %}
+               cd /app/StableDiffusion3.5
+               mkdir results
+
+               torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \
+                  --model {{ model.model_repo }} \
+                  --num_inference_steps 28 \
+                  --prompt "A capybara holding a sign that reads Hello World" \
+                  --use_torch_compile \
+                  --pipefusion_parallel_degree 4 \
+                  --use_cfg_parallel \
+                  --num_repetitions 50 \
+                  --dtype torch.float16 \
+                  --output_path results
+
+            {% endif %}
+
+            The generated video or image will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model in ["FLUX.1", "FLUX.1 Kontext", "FLUX.2"] %}results/timing.json{% elif model.model == "stable-diffusion-3.5-large" %}benchmark_results.csv{% endif %}

            {% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}

@@ -385,4 +459,4 @@ Previous versions
 =================

 See :doc:`benchmark-docker/previous-versions/xdit-history` to find documentation for previous releases
-of xDiT diffusion inference performance testing.
+of xDiT diffusion inference performance testing.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -33,18 +33,15 @@ It includes the following software components:
              - {{ component_version }}

            {% endfor %}
-         {% if jax_version == "0.6.0" %}
-         .. note::
-
-            Shardy is a new config in JAX 0.6.0. You might get related errors if it's
-            not configured correctly. For now you can turn it off by setting
-            ``shardy=False`` during the training run. You can also follow the `migration
-            guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
-            it.
-         {% endif %}
-
      {% endfor %}

+.. note::
+
+   The ``rocm/jax-training:maxtext-v25.9`` Docker image has been updated to
+   ``rocm/jax-training:maxtext-v25.9.1``. This revision includes a fix to
+   address segmentation fault issues during launch. See the
+   :doc:`versioned documentation <previous-versions/jax-maxtext-v25.9>`.
+
 MaxText with on ROCm provides the following key features to train large language models efficiently:

 - Transformer Engine (TE)
@@ -57,7 +54,7 @@ MaxText with on ROCm provides the following key features to train large language

 - NANOO FP8 (for MI300X series GPUs) and FP8 (for MI355X and MI350X) quantization support

-.. _amd-maxtext-model-support-v259:
+.. _amd-maxtext-model-support-v25.11:

 Supported models
 ================
@@ -139,7 +136,7 @@ Use the following command to pull the Docker image from Docker Hub.

      docker pull {{ docker.pull_tag }}

-.. _amd-maxtext-multi-node-setup-v259:
+.. _amd-maxtext-multi-node-setup-v25.11:

 Multi-node configuration
 ------------------------
@@ -147,7 +144,7 @@ Multi-node configuration
 See :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your
 environment for multi-node training.

-.. _amd-maxtext-get-started-v259:
+.. _amd-maxtext-get-started-v25.11:

 Benchmarking
 ============
@@ -172,7 +169,7 @@ benchmark results:
         .. tab-item:: MAD-integrated benchmarking

            The following run command is tailored to {{ model.model }}.
-            See :ref:`amd-maxtext-model-support-v259` to switch to another available model.
+            See :ref:`amd-maxtext-model-support-v25.11` to switch to another available model.

            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
               directory and install the required packages on the host machine.
@@ -203,7 +200,7 @@ benchmark results:
         .. tab-item:: Standalone benchmarking

            The following commands are optimized for {{ model.model }}. See
-            :ref:`amd-maxtext-model-support-v259` to switch to another
+            :ref:`amd-maxtext-model-support-v25.11` to switch to another
            available model. Some instructions and resources might not be
            available for all models and configurations.

@@ -325,15 +322,67 @@ benchmark results:

                  sbatch -N <num_nodes> {{ model.multinode_training_script }}

+            .. rubric:: Profiling with rocprofv3
+
+            If you need to collect a trace and the JAX profiler isn't working, use ``rocprofv3`` provided by the :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>` as a workaround. For example:
+
+            .. code-block:: bash
+
+               rocprofv3 \
+                   --hip-trace \
+                   --kernel-trace \
+                   --memory-copy-trace \
+                   --rccl-trace \
+                   --output-format pftrace \
+                   -d ./v3_traces \
+                   -- ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} # or desired command
+
+            You can set the directory where you want the trace files to be
+            saved using ``-d <TRACE_DIRECTORY>``. The resulting ``.pftrace``
+            traces can be opened in Perfetto: `<https://ui.perfetto.dev/>`__.
+
         {% else %}
            .. rubric:: Multi-node training

-            For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v259`
+            For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v25.11`
            with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
         {% endif %}
      {% endfor %}
   {% endfor %}

+Known issues
+============
+
+- Minor performance regression (< 4%) for BF16 quantization in Llama models and Mixtral 8x7b.
+
+- You might see minor loss spikes, or the loss curve might have slightly higher
+  convergence end values compared to the previous ``jax-training`` image.
+
+- For FP8 training on MI355X, many models will display a warning message like:
+  ``Warning: Latency not found for MI_M=16, MI_N=16, MI_K=128,
+  mi_input_type=BFloat8Float8_fnuz. Returning latency value of 32 (really
+  slow).`` The compile step may take longer than usual, but training will run.
+  This will be fixed in a future release.
+
+- The built-in JAX profiler isn't working. If you need to collect a trace,
+  use ``rocprofv3`` provided by the
+  :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>` as a workaround. For example:
+
+  .. code-block:: bash
+
+     rocprofv3 \
+         --hip-trace \
+         --kernel-trace \
+         --memory-copy-trace \
+         --rccl-trace \
+         --output-format pftrace \
+         -d ./v3_traces \
+         -- ./jax-maxtext_benchmark_report.sh -m <model_repo> # or desired command
+
+  You can set the directory where you want the trace files to be
+  saved using ``-d <TRACE_DIRECTORY>``. The resulting ``.pftrace`` traces can be
+  opened in Perfetto: `<https://ui.perfetto.dev/>`__.
+
 Further reading
 ===============

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
@@ -52,7 +52,7 @@ accelerate training workloads:
              - {{ component_version }}
            {% endfor %}

-   .. _amd-megatron-lm-model-support-v2510:
+   .. _amd-megatron-lm-model-support-v25.11:

   Supported models
   ================
@@ -97,7 +97,7 @@ accelerate training workloads:
   Some models, such as Llama, require an external license agreement through
   a third party (for example, Meta).

-.. _amd-megatron-lm-performance-measurements-v2510:
+.. _amd-megatron-lm-performance-measurements-v25.11:

 Performance measurements
 ========================
@@ -129,7 +129,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
 <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
 system's configuration.

-.. _mi300x-amd-megatron-lm-training-v2510:
+.. _mi300x-amd-megatron-lm-training-v25.11:

 Environment setup
 =================
@@ -138,7 +138,7 @@ Use the following instructions to set up the environment, configure the script t
 reproduce the benchmark results on MI300X Series GPUs with the AMD Megatron-LM Docker
 image.

-.. _amd-megatron-lm-requirements-v2510:
+.. _amd-megatron-lm-requirements-v25.11:

 Download the Docker image
 -------------------------
@@ -190,7 +190,7 @@ Download the Docker image
 The Docker container hosts a verified commit of
 `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__.

-.. _amd-megatron-lm-environment-setup-v2510:
+.. _amd-megatron-lm-environment-setup-v25.11:

 Configuration
 =============
@@ -200,39 +200,39 @@ Configuration
   Update the ``train_llama3.sh`` configuration script in the ``examples/llama``
   directory of
   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b

   Update the ``train_llama2.sh`` configuration script in the ``examples/llama``
   directory of
   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy

   Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3``
   directory of
   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b

   Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
   directory of
   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy

   Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral``
   directory of
   `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/mixtral>`__ to configure your training run.
-   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v2510>`.
+   Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.

 .. note::

-   See :ref:`Key options <amd-megatron-lm-benchmark-test-vars-v2510>` for more information on configuration options.
+   See :ref:`Key options <amd-megatron-lm-benchmark-test-vars-v25.11>` for more information on configuration options.

 Multi-node configuration
 ------------------------
@@ -240,7 +240,7 @@ Multi-node configuration
 Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
 training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands.

-.. _amd-megatron-lm-tokenizer-v2510:
+.. _amd-megatron-lm-tokenizer-v25.11:

 Tokenizer
 ---------
@@ -377,7 +377,7 @@ Download the dataset

   ``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer.
   Remember to either pre-download the tokenizer or setup Hugging Face access
-   otherwise when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer-v2510>` section.
+   otherwise when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer-v25.11>` section.

   .. note::

@@ -479,13 +479,13 @@ Download the dataset

   Ensure that the files are accessible inside the Docker container.

-.. _amd-megatron-lm-run-training-v2510:
+.. _amd-megatron-lm-run-training-v25.11:

 Run training
 ============

 Use the following example commands to set up the environment, configure
-:ref:`key options <amd-megatron-lm-benchmark-test-vars-v2510>`, and run training on
+:ref:`key options <amd-megatron-lm-benchmark-test-vars-v25.11>`, and run training on
 MI300X Series GPUs with the AMD Megatron-LM environment.

 Before starting training, export the following environment variables.
@@ -920,7 +920,7 @@ Single node training
          RECOMPUTE_ACTIVATIONS=full \
          CKPT_FORMAT=torch_dist

-.. _amd-megatron-lm-multi-node-examples-v2510:
+.. _amd-megatron-lm-multi-node-examples-v25.11:

 Multi-node training examples
 ----------------------------
@@ -971,7 +971,7 @@ training on 16 nodes, try the following command:

   sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh

-.. _amd-megatron-lm-benchmark-test-vars-v2510:
+.. _amd-megatron-lm-benchmark-test-vars-v25.11:

 Key options
 -----------
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
@@ -17,13 +17,22 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <http
     - Components
     - Resources

-   * - 25.9 (latest)
+   * - 25.11
+     -
+       * ROCm 7.1.0
+       * JAX 0.7.1
+     -
+       * :doc:`Documentation <../jax-maxtext>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.11/images/sha256-18e4d8f0b8ce7a7422c58046940dd5f32249960449fca09a562b65fb8eb1562a>`__
+
+   * - 25.9.1
     -
       * ROCm 7.0.0
       * JAX 0.6.2
     -
-       * :doc:`Documentation <../jax-maxtext>`
-       * `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7-jax060/images/sha256-7352212ae033a76dca2b9dceffc23c1b5f1a61a7a560082cf747a9bf1acfc9ce>`__
+       * :doc:`Documentation <jax-maxtext-v25.9>`
+       * `Docker Hub (25.9.1) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.9.1/images/sha256-60946cfbd470f6ee361fc9da740233a4fb2e892727f01719145b1f7627a1cff6>`__
+       * `Docker Hub (25.9) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.9/images/sha256-4bb16ab58279ef09cb7a5e362c38e3fe3f901de44d8dbac5d0cb3bac5686441e>`__

   * - 25.7
     -
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.7.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.7.rst
@@ -24,7 +24,7 @@ provides a prebuilt environment for training on AMD Instinct MI300X and MI325X G
 including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
 It includes the following software components:

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml

   {% set dockers = data.dockers %}
   .. tab-set::
@@ -80,7 +80,7 @@ series GPUs. Some instructions, commands, and available training
 configurations in this documentation might vary by model -- select one to get
 started.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml

   {% set model_groups = data.model_groups %}
   .. raw:: html
@@ -144,7 +144,7 @@ Pull the Docker image

 Use the following command to pull the Docker image from Docker Hub.

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml

   {% set dockers = data.dockers %}
   .. tab-set::
@@ -177,7 +177,7 @@ Benchmarking
 Once the setup is complete, choose between two options to reproduce the
 benchmark results:

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml

   .. _vllm-benchmark-mad:

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.9.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.9.rst
@@ -0,0 +1,365 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using JAX MaxText for ROCm.
+   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
+
+******************************************
+Training a model with JAX MaxText on ROCm
+******************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm JAX MaxText
+   training performance documentation. See :doc:`../jax-maxtext` for the latest version.
+
+.. note::
+
+   We have refreshed the ``rocm/jax-training:maxtext-v25.9`` image as
+   ``rocm/jax-training:maxtext-v25.9.1``. This should include a fix to address
+   segmentation fault issues during launch.
+
+The MaxText for ROCm training Docker image
+provides a prebuilt environment for training on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs,
+including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
+It includes the following software components:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.9-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   .. tab-set::
+
+      {% for docker in dockers %}
+      {% set jax_version = docker.components["JAX"] %}
+
+      .. tab-item:: ``{{ docker.pull_tag }}``
+         :sync: {{ docker.pull_tag }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+
+            {% endfor %}
+         {% if jax_version == "0.6.0" %}
+         .. note::
+
+            Shardy is a new config in JAX 0.6.0. You might get related errors if it's
+            not configured correctly. For now you can turn it off by setting
+            ``shardy=False`` during the training run. You can also follow the `migration
+            guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
+            it.
+         {% endif %}
+
+      {% endfor %}
+
+MaxText on ROCm provides the following key features to train large language models efficiently:
+
+- Transformer Engine (TE)
+
+- Flash Attention (FA) 3 -- with or without sequence input packing
+
+- GEMM tuning
+
+- Multi-node support
+
+- NANOO FP8 (for MI300X series GPUs) and FP8 (for MI355X and MI350X) quantization support
+
+.. _amd-maxtext-model-support-v259:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on AMD Instinct
+GPUs. Some instructions, commands, and available training
+configurations in this documentation might vary by model -- select one to get
+started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.9-benchmark-models.yaml
+
+   {% set model_groups = data.model_groups %}
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
+      </div>
+
+.. note::
+
+   Some models, such as Llama 3, require an external license agreement through
+   a third party (for example, Meta).
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+Environment setup
+=================
+
+This Docker image is optimized for specific model configurations outlined
+as follows. Performance can vary for other training workloads, as AMD
+doesn’t validate configurations and run conditions outside those described.
+
+Pull the Docker image
+---------------------
+
+Use the following command to pull the Docker image from Docker Hub.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.9-benchmark-models.yaml
+
+   {% set docker = data.dockers[0] %}
+
+   .. code-block:: shell
+
+      docker pull {{ docker.pull_tag }}
+
+.. _amd-maxtext-multi-node-setup-v259:
+
+Multi-node configuration
+------------------------
+
+See :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your
+environment for multi-node training.
+
+.. _amd-maxtext-get-started-v259:
+
+Benchmarking
+============
+
+Once the setup is complete, choose between two options to reproduce the
+benchmark results:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.9-benchmark-models.yaml
+
+   .. _vllm-benchmark-mad:
+
+   {% set docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         {% if model.mad_tag and "single-node" in model.doc_options %}
+         .. tab-item:: MAD-integrated benchmarking
+
+            The following run command is tailored to {{ model.model }}.
+            See :ref:`amd-maxtext-model-support-v259` to switch to another available model.
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. Use this command to run the performance benchmark test on the {{ model.model }} model
+               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/perf.csv``.
+         {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            The following commands are optimized for {{ model.model }}. See
+            :ref:`amd-maxtext-model-support-v259` to switch to another
+            available model. Some instructions and resources might not be
+            available for all models and configurations.
+
+            .. rubric:: Download the Docker image and required scripts
+
+            Run the JAX MaxText benchmark tool independently by starting the
+            Docker container as shown in the following snippet.
+
+            .. code-block:: shell
+
+               docker pull {{ docker.pull_tag }}
+
+            {% if model.model_repo and "single-node" in model.doc_options %}
+            .. rubric:: Single node training
+
+            1. Set up environment variables.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN=<Your Hugging Face token>
+                  export HF_HOME=<Location of saved/cached Hugging Face models>
+
+               ``MAD_SECRETS_HFTOKEN`` is your Hugging Face access token to access models, tokenizers, and data.
+               See `User access tokens <https://huggingface.co/docs/hub/en/security-tokens>`__.
+
+               ``HF_HOME`` is where ``huggingface_hub`` will store local data. See `huggingface_hub CLI <https://huggingface.co/docs/huggingface_hub/main/en/guides/cli#huggingface-cli-download>`__.
+               If you already have downloaded or cached Hugging Face artifacts, set this variable to that path.
+               Downloaded files typically get cached to ``~/.cache/huggingface``.
+
+            2. Launch the Docker container.
+
+               .. code-block:: shell
+
+                  docker run -it \
+                      --device=/dev/dri \
+                      --device=/dev/kfd \
+                      --network host \
+                      --ipc host \
+                      --group-add video \
+                      --cap-add=SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --privileged \
+                      -v $HOME:$HOME \
+                      -v $HOME/.ssh:/root/.ssh \
+                      -v $HF_HOME:/hf_cache \
+                      -e HF_HOME=/hf_cache \
+                      -e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN \
+                      --shm-size 64G \
+                      --name training_env \
+                      {{ docker.pull_tag }}
+
+            3. In the Docker container, clone the ROCm MAD repository and navigate to the
+               benchmark scripts directory at ``MAD/scripts/jax-maxtext``.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD/scripts/jax-maxtext
+
+            4. Run the setup scripts to install libraries and datasets needed
+               for benchmarking.
+
+               .. code-block:: shell
+
+                  ./jax-maxtext_benchmark_setup.sh -m {{ model.model_repo }}
+
+            5. To run the training benchmark without quantization, use the following command:
+
+               .. code-block:: shell
+
+                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }}
+
+               For quantized training, run the script with the appropriate option for your Instinct GPU.
+
+               .. tab-set::
+
+                  .. tab-item:: MI355X and MI350X
+
+                     For ``fp8`` quantized training on MI355X and MI350X GPUs, use the following command:
+
+                     .. code-block:: shell
+
+                        ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q fp8
+
+                  {% if model.model_repo not in ["Llama-3.1-70B", "Llama-3.3-70B"] %}
+                  .. tab-item:: MI325X and MI300X
+
+                     For ``nanoo_fp8`` quantized training on MI300X series GPUs, use the following command:
+
+                     .. code-block:: shell
+
+                        ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8
+                  {% endif %}
+
+            {% endif %}
+            {% if model.multinode_training_script and "multi-node" in model.doc_options %}
+            .. rubric:: Multi-node training
+
+            The following examples use SLURM to run on multiple nodes.
+
+            .. note::
+
+               The following scripts will launch the Docker container and run the
+               benchmark. Run them outside of any Docker container.
+
+            1. Make sure ``$HF_HOME`` is set before running the test. See
+               `ROCm benchmarking <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/readme.md>`__
+               for more details on downloading the Llama models before running the
+               benchmark.
+
+            2. To run multi-node training for {{ model.model }},
+               use the
+               `multi-node training script <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/{{ model.multinode_training_script }}>`__
+               under the ``scripts/jax-maxtext/gpu-rocm/`` directory.
+
+            3. Run the multi-node training benchmark script.
+
+               .. code-block:: shell
+
+                  sbatch -N <num_nodes> {{ model.multinode_training_script }}
+
+         {% else %}
+            .. rubric:: Multi-node training
+
+            For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v259`
+            with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+
+Further reading
+===============
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`jax-maxtext-history` to find documentation for previous releases
+of the ``ROCm/jax-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
@@ -16,7 +16,7 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
     - Components
     - Resources

-   * - v25.10 (latest)
+   * - v25.11
     -
       * ROCm 7.1.0
       * PyTorch 2.10.0.dev20251112+rocm7.1
@@ -25,6 +25,15 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
       * :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__

+   * - v25.10
+     -
+       * ROCm 7.1.0
+       * PyTorch 2.10.0.dev20251112+rocm7.1
+     -
+       * :doc:`Primus Megatron documentation <primus-megatron-v25.10>`
+       * :doc:`Megatron-LM (legacy) documentation <megatron-lm-v25.10>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__
+
   * - v25.9
     -
       * ROCm 7.0.0
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.10.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.10.rst
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.10.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.10.rst
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.10.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.10.rst
@@ -0,0 +1,448 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using PyTorch for ROCm.
+   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+****************************************
+Training a model with Primus and PyTorch
+****************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm Primus PyTorch training
+   performance benchmark documentation. See :doc:`../primus-pytorch` for the latest version.
+
+`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
+LLM training framework that streamlines LLM training on AMD Instinct GPUs
+using a modular, reproducible configuration paradigm.
+Primus now supports the PyTorch torchtitan backend.
+
+.. note::
+
+   For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
+   <https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
+   deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
+   The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
+   including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
+
+   Primus with the PyTorch torchtitan backend is designed to replace the
+   :doc:`ROCm PyTorch training <../pytorch-training>` workflow. See
+   :doc:`../pytorch-training` for steps to run workloads without Primus.
+
+AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and
+MI300X GPUs containing essential components for Primus and PyTorch training
+with Primus Turbo optimizations.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+
+   .. tab-set::
+
+      .. tab-item:: {{ data.docker.pull_tag }}
+         :sync: {{ data.docker.pull_tag }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in data.docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+            {% endfor %}
+
+.. _amd-primus-pytorch-model-support-v2510:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs.
+Some instructions, commands, and training recommendations in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+
+   {% set model_groups = data.model_groups %}
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
+      </div>
+
+.. seealso::
+
+   For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
+   see the documentation :doc:`../pytorch-training` (without Primus).
+
+.. _amd-primus-pytorch-performance-measurements-v2510:
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+This Docker image is optimized for specific model configurations outlined
+below. Performance can vary for other training workloads, as AMD
+doesn’t test configurations and run conditions outside those described.
+
+Pull the Docker image
+=====================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull {{ data.docker.pull_tag }}
+
+Run training
+============
+
+Once the setup is complete, choose between the following two workflows to start benchmarking training.
+For fine-tuning workloads and multi-node training examples, see :doc:`../pytorch-training` (without Primus).
+For best performance on MI325X, MI350X, and MI355X GPUs, you might need to
+tweak some configurations (such as batch sizes).
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+
+   {% set docker = data.docker %}
+   {% set model_groups = data.model_groups %}
+
+   .. tab-set::
+
+      .. tab-item:: MAD-integrated benchmarking
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            The following run command is tailored to {{ model.model }}.
+            See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model.
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
+               using one node with the {{ model.precision }} data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{ model.mad_tag }} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+               MAD launches a Docker container with the name
+               ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
+               model are collected in ``~/MAD/perf.csv``.
+
+      {% endfor %}
+   {% endfor %}
+
+      .. tab-item:: Primus benchmarking
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            The following run commands are tailored to {{ model.model }}.
+            See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model.
+
+            .. rubric:: Download the Docker image and required packages
+
+            1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.
+
+               .. code-block:: shell
+
+                  docker pull {{ docker.pull_tag }}
+
+            2. Run the Docker container.
+
+               .. code-block:: shell
+
+                  docker run -it \
+                      --device /dev/dri \
+                      --device /dev/kfd \
+                      --network host \
+                      --ipc host \
+                      --group-add video \
+                      --cap-add SYS_PTRACE \
+                      --security-opt seccomp=unconfined \
+                      --privileged \
+                      -v $HOME:$HOME \
+                      -v $HOME/.ssh:/root/.ssh \
+                      --shm-size 64G \
+                      --name training_env \
+                      {{ docker.pull_tag }}
+
+               Use these commands if you exit the ``training_env`` container and need to return to it.
+
+               .. code-block:: shell
+
+                  docker start training_env
+                  docker exec -it training_env bash
+
+            .. rubric:: Prepare training datasets and dependencies
+
+            The following benchmarking examples require downloading models and datasets
+            from Hugging Face. To ensure successful access to gated repos, set your
+            ``HF_TOKEN``.
+
+            .. code-block:: shell
+
+               export HF_TOKEN=$your_personal_hugging_face_access_token
+
+            .. rubric:: Pretraining
+
+            To get started, navigate to the ``Primus`` directory in your container.
+
+            .. code-block:: shell
+
+               cd /workspace/Primus
+
+            Now, to start the pretraining benchmark, use the ``run_pretrain.sh`` script
+            included with Primus with the appropriate options.
+
+            .. rubric:: Benchmarking examples
+
+            .. container:: model-doc primus_pyt_train_llama-3.1-8b
+
+               Use the following command to train Llama 3.1 8B with BF16 precision using Primus torchtitan.
+
+               .. tab-set::
+
+                  .. tab-item:: MI355X and MI350X
+                     :sync: MI355X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 6
+
+                  .. tab-item:: MI325X
+                     :sync: MI325X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 6
+
+                  .. tab-item:: MI300X
+                     :sync: MI300X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 4
+
+
+               To train Llama 3.1 8B with FP8 precision, use the following command.
+
+               .. tab-set::
+
+                  .. tab-item:: MI355X and MI350X
+                     :sync: MI355X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 8
+
+                  .. tab-item:: MI325X
+                     :sync: MI325X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 7
+
+                  .. tab-item:: MI300X
+                     :sync: MI300X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 5
+
+            .. container:: model-doc primus_pyt_train_llama-3.1-70b
+
+               Use the following command to train Llama 3.1 70B with BF16 precision using Primus torchtitan.
+
+               .. tab-set::
+
+                  .. tab-item:: MI355X and MI350X
+                     :sync: MI355X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 8
+
+                  .. tab-item:: MI325X
+                     :sync: MI325X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 6
+
+                  .. tab-item:: MI300X
+                     :sync: MI300X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 4
+
+               To train Llama 3.1 70B with FP8 precision, use the following command.
+
+               .. tab-set::
+
+                  .. tab-item:: MI355X and MI350X
+                     :sync: MI355X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 6
+
+                  .. tab-item:: MI325X
+                     :sync: MI325X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 5
+
+                  .. tab-item:: MI300X
+                     :sync: MI300X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 3
+
+            .. container:: model-doc primus_pyt_train_deepseek-v2
+
+               Use the following command to train DeepSeek V2 16B with BF16 precision using Primus torchtitan.
+
+               .. tab-set::
+
+                  .. tab-item:: MI355X and MI350X
+                     :sync: MI355X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 16
+
+                  .. tab-item:: MI325X
+                     :sync: MI325X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 10
+
+                  .. tab-item:: MI300X
+                     :sync: MI300X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 8
+
+               To train DeepSeek V2 16B with FP8 precision, use the following command.
+
+               .. tab-set::
+
+                  .. tab-item:: MI355X and MI350X
+                     :sync: MI355X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 16
+
+                  .. tab-item:: MI325X
+                     :sync: MI325X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 8
+
+                  .. tab-item:: MI300X
+                     :sync: MI300X
+
+                     .. code-block:: shell
+
+                        EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
+                        bash examples/run_pretrain.sh --training.local_batch_size 8
+      {% endfor %}
+   {% endfor %}
+
+Further reading
+===============
+
+- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
+  Framework for Large Models on AMD GPUs <https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html>`__.
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
@@ -16,7 +16,7 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
     - Components
     - Resources

-   * - v25.10 (latest)
+   * - v25.11
     -
       * ROCm 7.1.0
       * PyTorch 2.10.0.dev20251112+rocm7.1
@@ -25,6 +25,15 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
       * :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__

+   * - v25.10
+     -
+       * ROCm 7.1.0
+       * PyTorch 2.10.0.dev20251112+rocm7.1
+     -
+       * :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.10>`
+       * :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.10>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__
+
   * - v25.9
     -
       * ROCm 7.0.0
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.10.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.10.rst
@@ -0,0 +1,669 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using PyTorch for ROCm.
+   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with PyTorch on ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm PyTorch training
+   performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
+
+.. note::
+
+   For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
+   <https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
+   deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
+   The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
+   including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
+
+   See :doc:`../primus-pytorch` for details.
+
+PyTorch is an open-source machine learning framework that is widely used for
+model training with GPU-optimized components for transformer-based models.
+The PyTorch for ROCm training Docker image provides a prebuilt optimized
+environment for fine-tuning and pretraining a model on AMD Instinct MI325X
+and MI300X GPUs. It includes the following software components to accelerate
+training workloads:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+
+   .. tab-set::
+
+      .. tab-item:: {{ data.docker.pull_tag }}
+         :sync: {{ data.docker.pull_tag }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in data.docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+            {% endfor %}
+
+.. _amd-pytorch-training-model-support-v2510:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct
+MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and
+training recommendations in this documentation might vary by model -- select
+one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+
+   {% set model_groups = data.model_groups %}
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
+      </div>
+
+.. _amd-pytorch-training-supported-training-modes-v2510:
+
+The following table lists supported training modes per model.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+
+   {% set model_groups = data.model_groups %}
+   .. dropdown:: Supported training modes
+
+      .. list-table::
+         :header-rows: 1
+
+         * - Model
+           - Supported training modes
+
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+         {% if model.training_modes %}
+         * - {{ model.model }}
+           - ``{{ model.training_modes | join('``, ``') }}``
+
+         {% endif %}
+         {% endfor %}
+      {% endfor %}
+
+      .. note::
+
+         Some model and fine-tuning combinations are not listed. This is
+         because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
+         doesn't provide default YAML configurations for them.
+         For advanced usage, you can create a custom configuration to enable
+         unlisted fine-tuning methods by using an existing file in the
+         ``/workspace/torchtune/recipes/configs`` directory as a template.
+
+.. _amd-pytorch-training-performance-measurements-v2510:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+page provides reference throughput and latency measurements for training
+popular AI models.
+
+.. note::
+
+   The performance data presented in
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+   should not be interpreted as the peak performance achievable by AMD
+   Instinct MI325X and MI300X GPUs or ROCm software.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+This Docker image is optimized for specific model configurations outlined
+below. Performance can vary for other training workloads, as AMD
+doesn’t test configurations and run conditions outside those described.
+
+Run training
+============
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+
+   {% set docker = data.docker %}
+   {% set model_groups = data.model_groups %}
+
+   Once the setup is complete, choose between two options to start benchmarking training:
+
+   .. tab-set::
+
+      .. tab-item:: MAD-integrated benchmarking
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            The following run command is tailored to {{ model.model }}.
+            See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model.
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
+               using one node with the {{ model.precision }} data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{ model.mad_tag }} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+               MAD launches a Docker container with the name
+               ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
+               model are collected in ``~/MAD/perf.csv``.
+
+      {% endfor %}
+   {% endfor %}
+
+      .. tab-item:: Standalone benchmarking
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            The following commands are tailored to {{ model.model }}.
+            See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model.
+
+      {% endfor %}
+   {% endfor %}
+
+         .. rubric:: Download the Docker image and required packages
+
+         1. Use the following command to pull the Docker image from Docker Hub.
+
+            .. code-block:: shell
+
+               docker pull {{ docker.pull_tag }}
+
+         2. Launch the Docker container.
+
+            .. code-block:: shell
+
+               docker run -it \
+                   --device /dev/dri \
+                   --device /dev/kfd \
+                   --network host \
+                   --ipc host \
+                   --group-add video \
+                   --cap-add SYS_PTRACE \
+                   --security-opt seccomp=unconfined \
+                   --privileged \
+                   -v $HOME:$HOME \
+                   -v $HOME/.ssh:/root/.ssh \
+                   --shm-size 64G \
+                   --name training_env \
+                   {{ docker.pull_tag }}
+
+            Use these commands if you exit the ``training_env`` container and need to return to it.
+
+            .. code-block:: shell
+
+               docker start training_env
+               docker exec -it training_env bash
+
+         3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+            repository and navigate to the benchmark scripts directory
+            ``/workspace/MAD/scripts/pytorch_train``.
+
+            .. code-block:: shell
+
+               git clone https://github.com/ROCm/MAD
+               cd MAD/scripts/pytorch_train
+
+         .. rubric:: Prepare training datasets and dependencies
+
+         1. The following benchmarking examples require downloading models and datasets
+            from Hugging Face. To ensure successful access to gated repos, set your
+            ``HF_TOKEN``.
+
+            .. code-block:: shell
+
+               export HF_TOKEN=$your_personal_hugging_face_access_token
+
+         2. Run the setup script to install libraries and datasets needed for benchmarking.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_setup.sh
+
+            .. container:: model-doc pyt_train_llama-3.1-8b
+
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
+
+               .. list-table::
+                  :header-rows: 1
+
+                  * - Library
+                    - Reference
+
+                  * - ``accelerate``
+                    - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+            .. container:: model-doc pyt_train_llama-3.1-70b
+
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
+
+               .. list-table::
+                  :header-rows: 1
+
+                  * - Library
+                    - Reference
+
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+                  * - ``torchdata``
+                    - `TorchData <https://meta-pytorch.org/data/beta/index.html#torchdata>`__
+
+                  * - ``tomli``
+                    - `Tomli <https://pypi.org/project/tomli/>`__
+
+                  * - ``tiktoken``
+                    - `tiktoken <https://github.com/openai/tiktoken>`__
+
+                  * - ``blobfile``
+                    - `blobfile <https://pypi.org/project/blobfile/>`__
+
+                  * - ``tabulate``
+                    - `tabulate <https://pypi.org/project/tabulate/>`__
+
+                  * - ``wandb``
+                    - `Weights & Biases <https://github.com/wandb/wandb>`__
+
+                  * - ``sentencepiece``
+                    - `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
+
+                  * - ``tensorboard``
+                    - `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
+
+            .. container:: model-doc pyt_train_flux
+
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
+
+               .. list-table::
+                  :header-rows: 1
+
+                  * - Library
+                    - Reference
+
+                  * - ``accelerate``
+                    - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`__ 3.2.0
+
+                  * - ``sentencepiece``
+                    - `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
+
+                  * - ``tensorboard``
+                    - `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
+
+                  * - ``csvkit``
+                    - `csvkit <https://csvkit.readthedocs.io/en/latest/>`__ 2.0.1
+
+                  * - ``deepspeed``
+                    - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`__ 0.16.2
+
+                  * - ``diffusers``
+                    - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`__ 0.31.0
+
+                  * - ``GitPython``
+                    - `GitPython <https://github.com/gitpython-developers/GitPython>`__ 3.1.44
+
+                  * - ``opencv-python-headless``
+                    - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`__ 4.10.0.84
+
+                  * - ``peft``
+                    - `PEFT <https://huggingface.co/docs/peft/en/index>`__ 0.14.0
+
+                  * - ``protobuf``
+                    - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`__ 5.29.2
+
+                  * - ``pytest``
+                    - `PyTest <https://docs.pytest.org/en/stable/>`__ 8.3.4
+
+                  * - ``python-dotenv``
+                    - `python-dotenv <https://pypi.org/project/python-dotenv/>`__ 1.0.1
+
+                  * - ``seaborn``
+                    - `Seaborn <https://seaborn.pydata.org/>`__ 0.13.2
+
+                  * - ``transformers``
+                    - `Transformers <https://huggingface.co/docs/transformers/en/index>`__ 4.47.0
+
+            ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
+
+            * `frank-chieng/chinese_architecture_siheyuan <https://huggingface.co/datasets/frank-chieng/chinese_architecture_siheyuan>`__
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+         {% set training_modes = model.training_modes %}
+         {% set training_mode_descs = {
+            "pretrain": "Benchmark pre-training.",
+            "HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
+         } %}
+         {% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
+         {% if available_modes %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Pretraining
+
+            To start the pre-training benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+
+            {% if model.mad_tag == "pyt_train_dlrm" %}
+
+            1. Go to the DLRM directory.
+
+               .. code-block:: shell
+
+                  cd /workspace/DLRMBenchmark
+
+            2. To run the single node training benchmark for DLRM-v2 with TF32 precision,
+               run the following script.
+
+               .. code-block:: shell
+
+                  ./launch_training_single_node.sh
+
+               To run with MAD within the Docker container, use the following command.
+
+               .. code-block:: shell
+
+                  ./pytorch_benchmark_report.sh -t pretrain -m DLRM
+
+            {% else %}
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length
+
+            {% if model.mad_tag == "pyt_train_flux" %}
+            .. container:: model-doc {{ model.mad_tag }}
+
+               .. note::
+
+                  Currently, FLUX models are not supported out of the box in this Docker image.
+                  To use FLUX, refer to the ``rocm/pytorch-training`` Docker image documentation: :doc:`pytorch-training-v25.6`.
+
+                  Occasionally, downloading the Flux dataset might fail. In the event of this
+                  error, manually download it from Hugging Face at
+                  `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+                  and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
+                  the required dataset.
+            {% endif %}
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}
+
+               * - ``$datatype``
+                 - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
+                 - Only Llama 3.1 8B supports FP8 precision.
+
+               * - ``$sequence_length``
+                 - Between 2048 and 8192. 8192 by default.
+                 - Sequence length for the language model.
+            {% endif %}
+         {% endif %}
+
+         {% set training_modes = model.training_modes %}
+         {% set training_mode_descs = {
+            "posttrain": "Benchmark post-training.",
+         } %}
+         {% set available_modes = training_modes | select("in", ["posttrain"]) | list %}
+         {% if available_modes %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Post-training
+
+            To start the post-training benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}
+
+               * - ``$datatype``
+                 - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
+                 - Only Llama 3.1 8B supports FP8 precision.
+
+               * - ``$sequence_length``
+                 - Between 2048 and 8192. 8192 by default.
+                 - Sequence length for the language model.
+         {% endif %}
+
+         {% set training_mode_descs = {
+            "finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
+            "finetune_lora": "LoRA fine-tuning (BF16 supported).",
+            "finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
+            "HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
+         } %}
+         {% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
+         {% if available_modes %}
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Fine-tuning
+
+            To start the fine-tuning benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+            See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v2510>`.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t $training_mode \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}
+
+               * - ``$datatype``
+                 - ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
+                 - All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}
+
+               * - ``$sequence_length``
+                 - Between 2048 and 16384.
+                 - Sequence length for the language model.
+
+            {% if model.mad_tag in ["pyt_train_llama-3.2-vision-11b", "pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
+            .. note::
+
+               For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
+               use the following torchtune commit for compatibility:
+
+               .. code-block:: shell
+
+                  git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
+
+            {% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
+            .. note::
+
+               You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
+               input tensor should be smaller than max_seq_len (4096)``.
+               This error indicates that an input sequence is longer than the model's maximum context window.
+
+               Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
+               tokens in this case). You can resolve this by truncating the input or splitting
+               it into smaller chunks before passing it to the model.
+
+               Note on reproducibility: The results in this guide are based on
+               commit ``b4c98ac`` from the upstream
+               `<https://github.com/pytorch/torchtune>`__ repository. For the
+               latest updates, you can use the main branch.
+
+            {% endif %}
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+
+            .. rubric:: Benchmarking examples
+
+            For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+
+.. _amd-pytorch-training-multinode-examples-v2510:
+
+Multi-node training
+-------------------
+
+Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
+training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands.
+
+Pre-training
+~~~~~~~~~~~~
+
+Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch run_slurm_train.sh
+
+Fine-tuning
+~~~~~~~~~~~
+
+Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   huggingface-cli login # Get access to HF Llama model space
+   huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch Torchtune_Multinode.sh
+
+.. note::
+
+   Information regarding benchmark setup:
+
+   * By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
+   * You can adjust the torchtune `YAML configuration file
+     <https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
+     if you're using a different model.
+   * The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
+   * Set the ``mounting_paths`` inside the SLURM script.
+
+Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
+
+Further reading
+===============
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
@@ -47,7 +47,7 @@ Megatron-LM.
              - {{ component_version }}
            {% endfor %}

-.. _amd-primus-megatron-lm-model-support-v2510:
+.. _amd-primus-megatron-lm-model-support-v25.11:

 Supported models
 ================
@@ -108,7 +108,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
 <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
 system's configuration.

-.. _mi300x-amd-primus-megatron-lm-training-v2510:
+.. _mi300x-amd-primus-megatron-lm-training-v25.11:

 Environment setup
 =================
@@ -118,7 +118,7 @@ Environment setup
   Use the following instructions to set up the environment, configure the script to train models, and
   reproduce the benchmark results on AMD Instinct GPUs.

-.. _amd-primus-megatron-lm-requirements-v2510:
+.. _amd-primus-megatron-lm-requirements-v25.11:

 Pull the Docker image

@@ -157,16 +157,16 @@ Pull the Docker image
         docker start primus_training_env
         docker exec -it primus_training_env bash

-The Docker container hosts verified branch ``release/v25.10`` of the `Primus
-<https://github.com/AMD-AGI/Primus/tree/release/v25.10>`__ repository.
+The Docker container hosts verified commit ``c4c083de`` of the `Primus
+<https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/>`__ repository.

-.. _amd-primus-megatron-lm-environment-setup-v2510:
+.. _amd-primus-megatron-lm-environment-setup-v25.11:

 Configuration
 =============

 Primus defines a training configuration in YAML for each model in
-`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs>`__.
+`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/examples/megatron/configs>`__.

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

@@ -207,7 +207,7 @@ You can use either mock data or real data for training.

  Ensure that the files are accessible inside the Docker container.

-.. _amd-primus-megatron-lm-tokenizer-v2510:
+.. _amd-primus-megatron-lm-tokenizer-v25.11:

 Tokenizer
 ---------
@@ -228,7 +228,7 @@ right permissions to access the tokenizer for each model.
   <https://github.com/AMD-AGI/Primus/blob/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
   definition.

-.. _amd-primus-megatron-lm-run-training-v2510:
+.. _amd-primus-megatron-lm-run-training-v25.11:

 Run training
 ============
@@ -252,7 +252,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 3.3 70B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To run pre-training for Llama 3.3 70B BF16, run:

@@ -263,11 +263,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

         .. code-block:: shell

-            EXP=examples/megatron/configs/MI355X/llama3.3_70B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                --train_iters 50 \
-                --micro_batch_size 6 \
-                --global_batch_size 48 \
+            EXP=examples/megatron/configs/MI355X/llama3.3_70B-BF16-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

      .. tab-item:: MI300X
         :sync: MI325X and MI300X
@@ -279,17 +276,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/llama3.3_70B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                --train_iters 50 \
-                --micro_batch_size 2 \
-                --global_batch_size 16
+            EXP=examples/megatron/configs/MI300X/llama3.3_70B-BF16-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 3.1 8B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To run pre-training for Llama 3.1 8B FP8, run:

@@ -300,12 +294,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

         .. code-block:: shell

-            EXP=examples/megatron/configs/MI355X/llama3.1_8B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                --train_iters 50 \
-                --fp8 hybrid \
-                --micro_batch_size 4 \
-                --global_batch_size 512 \
+            EXP=examples/megatron/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

      .. tab-item:: MI300X
         :sync: MI325X and MI300X
@@ -317,10 +307,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                --train_iters 50 \
-                --fp8 hybrid
+            EXP=examples/megatron/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

   For Llama 3.1 8B BF16, use the following command:

@@ -331,11 +319,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

         .. code-block:: shell

-            EXP=examples/megatron/configs/MI355X/llama3.1_8B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                --train_iters 50 \
-                --micro_batch_size 4 \
-                --global_batch_size 512 \
+            EXP=examples/megatron/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

      .. tab-item:: MI300X
         :sync: MI325X and MI300X
@@ -347,15 +332,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/llama3.1_8B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                --train_iters 50
+            EXP=examples/megatron/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 3.1 70B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To run pre-training for Llama 3.1 70B BF16, run:

@@ -366,11 +350,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

         .. code-block:: shell

-            EXP=examples/megatron/configs/MI355X/llama3.1_70B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                 --train_iters 50 \
-                 --micro_batch_size 4 \
-                 --global_batch_size 32
+            EXP=examples/megatron/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

      .. tab-item:: MI300X
         :sync: MI325X and MI300X
@@ -382,9 +363,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/llama3.1_70B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                 --train_iters 50
+            EXP=examples/megatron/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

   To run the training on a single node for Llama 3.1 70B FP8, use the following command.

@@ -401,13 +381,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

         .. code-block:: shell

-            EXP=examples/megatron/configs/MI355X/llama3.1_70B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                --train_iters 50 \
-                --fp8 hybrid \
-                --no_fp8_weight_transpose_cache true \
-                --micro_batch_size 3 \
-                --global_batch_size 24
+            EXP=examples/megatron/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

      .. tab-item:: MI300X
         :sync: MI325X and MI300X
@@ -419,7 +394,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/llama3.1_70B-pretrain.yaml \
+            EXP=examples/megatron/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
            bash ./examples/run_pretrain.sh \
                --train_iters 50 \
                --num_layers 40 \
@@ -430,7 +405,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 2 7B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To run pre-training for Llama 2 7B FP8, run:

@@ -441,12 +416,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

         .. code-block:: shell

-            EXP=examples/megatron/configs/MI355X/llama2_7B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                --train_iters 50 \
-                --fp8 hybrid \
-                --micro_batch_size 13 \
-                --global_batch_size 416
+            EXP=examples/megatron/configs/MI355X/llama2_7B-FP8-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

      .. tab-item:: MI300X
         :sync: MI325X and MI300X
@@ -458,10 +429,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/llama2_7B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                --train_iters 50 \
-                --fp8 hybrid
+            EXP=examples/megatron/configs/MI300X/llama2_7B-FP8-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

   To run pre-training for Llama 2 7B BF16, run:

@@ -472,11 +441,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

         .. code-block:: shell

-            EXP=examples/megatron/configs/MI355X/llama2_7B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                --train_iters 50 \
-                --micro_batch_size 10 \
-                --global_batch_size 640
+            EXP=examples/megatron/configs/MI355X/llama2_7B-BF16-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

      .. tab-item:: MI300X
         :sync: MI325X and MI300X
@@ -488,15 +454,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/llama2_7B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                --train_iters 50
+            EXP=examples/megatron/configs/MI300X/llama2_7B-BF16-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 2 70B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To run pre-training for Llama 2 70B BF16, run:

@@ -507,11 +472,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

         .. code-block:: shell

-            EXP=examples/megatron/configs/MI355X/llama2_70B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                --train_iters 50 \
-                --micro_batch_size 17 \
-                --global_batch_size 272
+            EXP=examples/megatron/configs/MI355X/llama2_70B-BF16-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

      .. tab-item:: MI300X
         :sync: MI325X and MI300X
@@ -523,15 +485,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/llama2_70B-pretrain.yaml \
-            bash ./examples/run_pretrain.sh \
-                --train_iters 50
+            EXP=examples/megatron/configs/MI300X/llama2_70B-BF16-pretrain.yaml \
+            bash ./examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to DeepSeek-V3.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) BF16 with 3-layer proxy,
   use the following command:
@@ -543,7 +504,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

         .. code-block:: shell

-            EXP=examples/megatron/configs/MI355X/deepseek_v3-pretrain.yaml \
+            EXP=examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml \
            bash examples/run_pretrain.sh \
                --num_layers 3 \
                --moe_layer_freq 1 \
@@ -561,19 +522,17 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/deepseek_v3-pretrain.yaml \
+            EXP=examples/megatron/configs/MI300X/deepseek_v3-BF16-pretrain.yaml \
            bash examples/run_pretrain.sh \
                --num_layers 3 \
                --moe_layer_freq 1 \
-                --micro_batch_size 3 \
-                --global_batch_size 192 \
                --train_iters 50

 .. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to DeepSeek-V2-Lite.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel) BF16,
   use the following command:
@@ -585,11 +544,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

         .. code-block:: shell

-            EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-pretrain.yaml \
-            bash examples/run_pretrain.sh \
-                --train_iters 50 \
-                --micro_batch_size 12 \
-                --global_batch_size 768
+            EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml \
+            bash examples/run_pretrain.sh

      .. tab-item:: MI300X
         :sync: MI325X and MI300X
@@ -601,16 +557,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/deepseek_v2_lite-pretrain.yaml \
-            bash examples/run_pretrain.sh \
-                --train_iters 50 \
-                --global_batch_size 256
+            EXP=examples/megatron/configs/MI300X/deepseek_v2_lite-BF16-pretrain.yaml \
+            bash examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Mixtral 8x7B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
   use the following command:
@@ -622,11 +576,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

         .. code-block:: shell

-            EXP=examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-pretrain.yaml \
-            bash examples/run_pretrain.sh \
-                --train_iters 50 \
-                --micro_batch_size 4 \
-                --global_batch_size 256
+            EXP=examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \
+            bash examples/run_pretrain.sh

      .. tab-item:: MI300X
         :sync: MI325X and MI300X
@@ -638,7 +589,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-pretrain.yaml \
+            EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \
            bash examples/run_pretrain.sh \
                --train_iters 50

@@ -646,7 +597,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Mixtral 8x22B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To run training on a single node for Mixtral 8x22B BF16 (MoE with expert parallel) 4-layer proxy,
   use the following command:
@@ -658,13 +609,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

         .. code-block:: shell

-            EXP=examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-pretrain.yaml \
-            bash examples/run_pretrain.sh \
-                --train_iters 50 \
-                --num_layers 4 \
-                --pipeline_model_parallel_size 1 \
-                --micro_batch_size 2 \
-                --global_batch_size 16
+            EXP=examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-BF16-pretrain.yaml \
+            bash examples/run_pretrain.sh

      .. tab-item:: MI300X
         :sync: MI325X and MI300X
@@ -676,7 +622,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-pretrain.yaml \
+            EXP=examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-BF16-pretrain.yaml \
            bash examples/run_pretrain.sh \
                --train_iters 50 \
                --num_layers 4 \
@@ -688,7 +634,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Qwen 2.5 7B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To run training on a single node for Qwen 2.5 7B BF16, use the following
   command:
@@ -700,11 +646,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

         .. code-block:: shell

-            EXP=examples/megatron/configs/MI355X/qwen2.5_7B-pretrain.yaml \
-            bash examples/run_pretrain.sh \
-                --train_iters 50 \
-                --micro_batch_size 16 \
-                --global_batch_size 768
+            EXP=examples/megatron/configs/MI355X/qwen2.5_7B-BF16-pretrain.yaml \
+            bash examples/run_pretrain.sh

      .. tab-item:: MI300X
         :sync: MI325X and MI300X
@@ -716,9 +659,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/qwen2.5_7B-pretrain.yaml \
-            bash examples/run_pretrain.sh \
-                --train_iters 50
+            EXP=examples/megatron/configs/MI300X/qwen2.5_7B-BF16-pretrain.yaml \
+            bash examples/run_pretrain.sh

   For FP8, use the following command.

@@ -729,12 +671,8 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the

         .. code-block:: shell

-            EXP=examples/megatron/configs/MI355X/qwen2.5_7B-pretrain.yaml \
-            bash examples/run_pretrain.sh \
-                --train_iters 50 \
-                --fp8 hybrid
-                --micro_batch_size 20 \
-                --global_batch_size 800
+            EXP=examples/megatron/configs/MI355X/qwen2.5_7B-FP8-pretrain.yaml \
+            bash examples/run_pretrain.sh

      .. tab-item:: MI300X
         :sync: MI325X and MI300X
@@ -746,16 +684,14 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/qwen2.5_7B-pretrain.yaml \
-            bash examples/run_pretrain.sh \
-                --train_iters 50 \
-                --fp8 hybrid
+            EXP=examples/megatron/configs/MI300X/qwen2.5_7B-FP8-pretrain.yaml \
+            bash examples/run_pretrain.sh

 .. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Qwen 2.5 72B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To run the training on a single node for Qwen 2.5 72B BF16, use the following command.

@@ -782,11 +718,10 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
            export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
            export NVTE_CK_IS_V3_ATOMIC_FP32=1

-            EXP=examples/megatron/configs/MI300X/qwen2.5_72B-pretrain.yaml \
-            bash examples/run_pretrain.sh \
-                --train_iters 50
+            EXP=examples/megatron/configs/MI300X/qwen2.5_72B-BF16-pretrain.yaml \
+            bash examples/run_pretrain.sh

-.. _amd-primus-megatron-multi-node-examples-v2510:
+.. _amd-primus-megatron-multi-node-examples-v25.11:

 Multi-node training examples
 ----------------------------
@@ -805,7 +740,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

      git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git
      cd Primus
-      git checkout release/v25.10
+      git checkout c4c083de64ba3e8f19ccc9629411267108931f9e
      git submodule update --init --recursive

      export DOCKER_IMAGE={{ docker.pull_tag }}
@@ -828,13 +763,13 @@ to launch the multi-node workload. Use the following steps to setup your environ
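-   * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect. However, since NICs can vary accross different cluster, it is encouraged to explicitly export your NCCL parameters for the cluster.
+   * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect them. However, since NICs can vary across clusters, you should explicitly export your NCCL parameters for your cluster.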
   * To find your network interface, you can use ``ip a``.
   * To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB  devices.
-   * Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v2510`) as appropriate.
+   * Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v25.11`) as appropriate.

 .. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 3.1 8B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To train Llama 3.1 8B FP8 on 8 nodes, run:

@@ -843,16 +778,15 @@ to launch the multi-node workload. Use the following steps to setup your environ
      # Adjust the training parameters.
      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
      NNODES=8 \
-      EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+      EXP=examples/megatron/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
-          --global_batch_size 1024 \
-          --fp8 hybrid
+          --global_batch_size 1024

 .. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 2 7B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To train Llama 2 7B FP8 on 8 nodes, run:

@@ -861,16 +795,15 @@ to launch the multi-node workload. Use the following steps to setup your environ
      # Adjust the training parameters.
      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
      NNODES=8 \
-      EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+      EXP=examples/megatron/configs/MI300X/llama2_7B-FP8-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
-          --global_batch_size 2048 \
-          --fp8 hybrid
+          --global_batch_size 2048

 .. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 3.1 70B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To train Llama 3.1 70B FP8 on 8 nodes, run:

@@ -879,20 +812,18 @@ to launch the multi-node workload. Use the following steps to setup your environ
      # Adjust the training parameters.
      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
      NNODES=8 \
-      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+      EXP=examples/megatron/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
          --micro_batch_size 4 \
          --global_batch_size 256 \
-          --recompute_num_layers 80 \
-          --no_fp8_weight_transpose_cache true \
-          --fp8 hybrid
+          --recompute_num_layers 80

   To train Llama 3.1 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 \
-      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+      EXP=examples/megatron/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
          --micro_batch_size 1 \
          --global_batch_size 256 \
@@ -902,7 +833,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 2 70B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To train Llama 2 70B FP8 on 8 nodes, run:

@@ -911,20 +842,18 @@ to launch the multi-node workload. Use the following steps to setup your environ
      # Adjust the training parameters.
      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
      NNODES=8 \
-      EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+      EXP=examples/megatron/configs/MI300X/llama2_70B-FP8-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
          --micro_batch_size 10 \
          --global_batch_size 640 \
-          --recompute_num_layers 80 \
-          --no_fp8_weight_transpose_cache true \
-          --fp8 hybrid
+          --recompute_num_layers 80

   To train Llama 2 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 \
-      EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+      EXP=examples/megatron/configs/MI300X/llama2_70B-BF16-pretrain.yaml \
      bash ./examples/run_slurm_pretrain.sh \
          --micro_batch_size 2 \
          --global_batch_size 1536 \
@@ -934,7 +863,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Llama 3.3 70B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To train Llama 3.3 70B FP8 on 8 nodes, run:

@@ -943,20 +872,18 @@ to launch the multi-node workload. Use the following steps to setup your environ
      # Adjust the training parameters.
      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
      NNODES=8 \
-      EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+      EXP=examples/megatron/configs/MI300X/llama3.3_70B-FP8-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
          --micro_batch_size 4 \
          --global_batch_size 256 \
-          --recompute_num_layers 80 \
-          --no_fp8_weight_transpose_cache true \
-          --fp8 hybrid
+          --recompute_num_layers 80

   To train Llama 3.3 70B BF16 on 8 nodes, run:

   .. code-block:: shell

      NNODES=8 \
-      EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+      EXP=examples/megatron/configs/MI300X/llama3.3_70B-BF16-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
          --micro_batch_size 1 \
          --global_batch_size 256 \
@@ -966,7 +893,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

   Once setup is complete, run the appropriate training command.
-   The following run commands are tailored to Llama 2 70B.
+   The following run commands are tailored to Mixtral 8x7B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To train Mixtral 8x7B BF16 on 8 nodes, run:

@@ -975,7 +902,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
      # Adjust the training parameters.
      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
      NNODES=8 \
-      EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
+      EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
          --micro_batch_size 2 \
          --global_batch_size 256
@@ -984,7 +911,7 @@ to launch the multi-node workload. Use the following steps to setup your environ

   Once setup is complete, run the appropriate training command.
   The following run commands are tailored to Qwen2.5 72B.
-   See :ref:`amd-primus-megatron-lm-model-support-v2510` to switch to another available model.
+   See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.

   To train Qwen2.5 72B FP8 on 8 nodes, run:

@@ -993,15 +920,13 @@ to launch the multi-node workload. Use the following steps to setup your environ
      # Adjust the training parameters.
      # For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
      NNODES=8 \
-      EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
+      EXP=examples/megatron/configs/MI300X/qwen2.5_72B-FP8-pretrain.yaml \
      bash examples/run_slurm_pretrain.sh \
          --micro_batch_size 8 \
          --global_batch_size 512 \
          --recompute_num_layers 80
-          --no_fp8_weight_transpose_cache true \
-          --fp8 hybrid

-.. _amd-primus-megatron-lm-benchmark-test-vars-v2510:
+.. _amd-primus-megatron-lm-benchmark-test-vars-v25.11:

 Key options
 -----------
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
@@ -45,7 +45,7 @@ with Primus Turbo optimizations.
              - {{ component_version }}
            {% endfor %}

-.. _amd-primus-pytorch-model-support-v2510:
+.. _amd-primus-pytorch-model-support-v25.11:

 Supported models
 ================
@@ -91,7 +91,7 @@ vary by model -- select one to get started.
   For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
   see the documentation :doc:`pytorch-training` (without Primus)

-.. _amd-primus-pytorch-performance-measurements-v2510:
+.. _amd-primus-pytorch-performance-measurements-v25.11:

 System validation
 =================
@@ -146,7 +146,7 @@ tweak some configurations (such as batch sizes).
         .. container:: model-doc {{ model.mad_tag }}

            The following run command is tailored to {{ model.model }}.
-            See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model.
+            See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model.

            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
               directory and install the required packages on the host machine.
@@ -184,7 +184,7 @@ tweak some configurations (such as batch sizes).
         .. container:: model-doc {{ model.mad_tag }}

            The following run commands are tailored to {{ model.model }}.
-            See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model.
+            See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model.

            .. rubric:: Download the Docker image and required packages

@@ -220,6 +220,9 @@ tweak some configurations (such as batch sizes).
                  docker start training_env
                  docker exec -it training_env bash

+               The Docker container hosts verified commit ``c4c083de`` of the `Primus
+               <https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/>`__ repository.
+
            .. rubric:: Prepare training datasets and dependencies

            The following benchmarking examples require downloading models and datasets
@@ -255,7 +258,7 @@ tweak some configurations (such as batch sizes).
                     .. code-block:: shell

                        EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 6
+                        bash examples/run_pretrain.sh

                  .. tab-item:: MI325X
                     :sync: MI325X
@@ -263,7 +266,7 @@ tweak some configurations (such as batch sizes).
                     .. code-block:: shell

                        EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 6
+                        bash examples/run_pretrain.sh --training.local_batch_size 6 

                  .. tab-item:: MI300X
                     :sync: MI300X
@@ -271,8 +274,7 @@ tweak some configurations (such as batch sizes).
                     .. code-block:: shell

                        EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 4
-
+                        bash examples/run_pretrain.sh

               To train Llama 3.1 8B with FP8 precision, use the following command.

@@ -283,8 +285,8 @@ tweak some configurations (such as batch sizes).

                     .. code-block:: shell

-                        EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 8
+                        EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
+                        bash examples/run_pretrain.sh

                  .. tab-item:: MI325X
                     :sync: MI325X
@@ -292,7 +294,7 @@ tweak some configurations (such as batch sizes).
                     .. code-block:: shell

                        EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 7
+                        bash examples/run_pretrain.sh --training.local_batch_size 7 

                  .. tab-item:: MI300X
                     :sync: MI300X
@@ -300,7 +302,7 @@ tweak some configurations (such as batch sizes).
                     .. code-block:: shell

                        EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 5
+                        bash examples/run_pretrain.sh

            .. container:: model-doc primus_pyt_train_llama-3.1-70b

@@ -314,7 +316,7 @@ tweak some configurations (such as batch sizes).
                     .. code-block:: shell

                        EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 8
+                        bash examples/run_pretrain.sh

                  .. tab-item:: MI325X
                     :sync: MI325X
@@ -322,7 +324,7 @@ tweak some configurations (such as batch sizes).
                     .. code-block:: shell

                        EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 6
+                        bash examples/run_pretrain.sh --training.local_batch_size 6 

                  .. tab-item:: MI300X
                     :sync: MI300X
@@ -330,7 +332,7 @@ tweak some configurations (such as batch sizes).
                     .. code-block:: shell

                        EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 4
+                        bash examples/run_pretrain.sh

               To train Llama 3.1 70B with FP8 precision, use the following command.

@@ -342,7 +344,7 @@ tweak some configurations (such as batch sizes).
                     .. code-block:: shell

                        EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 6
+                        bash examples/run_pretrain.sh

                  .. tab-item:: MI325X
                     :sync: MI325X
@@ -350,7 +352,7 @@ tweak some configurations (such as batch sizes).
                     .. code-block:: shell

                        EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 5
+                        bash examples/run_pretrain.sh --training.local_batch_size 5 

                  .. tab-item:: MI300X
                     :sync: MI300X
@@ -358,11 +360,11 @@ tweak some configurations (such as batch sizes).
                     .. code-block:: shell

                        EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 3
+                        bash examples/run_pretrain.sh

-            .. container:: model-doc primus_pyt_train_deepseek-v2
+            .. container:: model-doc primus_pyt_train_deepseek-v3-16b

-               Use the following command to run train DeepSeek V2 16B with BF16 precision using Primus torchtitan.
+               Use the following command to train DeepSeek V3 16B with BF16 precision using Primus torchtitan.

               .. tab-set::

@@ -372,7 +374,7 @@ tweak some configurations (such as batch sizes).
                     .. code-block:: shell

                        EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 16
+                        bash examples/run_pretrain.sh

                  .. tab-item:: MI325X
                     :sync: MI325X
@@ -380,7 +382,7 @@ tweak some configurations (such as batch sizes).
                     .. code-block:: shell

                        EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 10
+                        bash examples/run_pretrain.sh --training.local_batch_size 10 

                  .. tab-item:: MI300X
                     :sync: MI300X
@@ -388,35 +390,7 @@ tweak some configurations (such as batch sizes).
                     .. code-block:: shell

                        EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 8
-
-               To train DeepSeek V2 16B with FP8 precision, use the following command.
-
-               .. tab-set::
-
-                  .. tab-item:: MI355X and MI350X
-                     :sync: MI355X
-
-                     .. code-block:: shell
-
-                        EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 16
-
-                  .. tab-item:: MI325X
-                     :sync: MI325X
-
-                     .. code-block:: shell
-
-                        EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 8
-
-                  .. tab-item:: MI300X
-                     :sync: MI300X
-
-                     .. code-block:: shell
-
-                        EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
-                        bash examples/run_pretrain.sh --training.local_batch_size 8
+                        bash examples/run_pretrain.sh
      {% endfor %}
   {% endfor %}

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
@@ -43,7 +43,7 @@ training workloads:
              - {{ component_version }}
            {% endfor %}

-.. _amd-pytorch-training-model-support-v2510:
+.. _amd-pytorch-training-model-support-v25.11:

 Supported models
 ================
@@ -85,7 +85,7 @@ one to get started.
         </div>
      </div>

-.. _amd-pytorch-training-supported-training-modes-v2510:
+.. _amd-pytorch-training-supported-training-modes-v25.11:

 The following table lists supported training modes per model.

@@ -120,7 +120,7 @@ The following table lists supported training modes per model.
         unlisted fine-tuning methods by using an existing file in the
         ``/workspace/torchtune/recipes/configs`` directory as a template.

-.. _amd-pytorch-training-performance-measurements-v2510:
+.. _amd-pytorch-training-performance-measurements-v25.11:

 Performance measurements
 ========================
@@ -176,7 +176,7 @@ Run training
         .. container:: model-doc {{ model.mad_tag }}

            The following run command is tailored to {{ model.model }}.
-            See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model.
+            See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model.

            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
               directory and install the required packages on the host machine.
@@ -214,7 +214,7 @@ Run training
         .. container:: model-doc {{ model.mad_tag }}

            The following commands are tailored to {{ model.model }}.
-            See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model.
+            See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model.

      {% endfor %}
   {% endfor %}
@@ -532,7 +532,7 @@ Run training

            To start the fine-tuning benchmark, use the following command with the
            appropriate options. See the following list of options and their descriptions.
-            See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v2510>`.
+            See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v25.11>`.

            .. code-block:: shell

@@ -597,7 +597,7 @@ Run training

            For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.

-.. _amd-pytorch-training-multinode-examples-v2510:
+.. _amd-pytorch-training-multinode-examples-v25.11:

 Multi-node training
 -------------------
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -75,8 +75,14 @@ subtrees:
        - entries:
          - file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
            title: Train a model with Primus and Megatron-LM
+            entries:
+            - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+              title: Train a model with Megatron-LM
          - file: how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
            title: Train a model with Primus and PyTorch
+            entries:
+            - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+              title: Train a model with PyTorch
          - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
            title: Train a model with JAX MaxText
          - file: how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -164,7 +164,7 @@ pygments==2.19.2
    #   sphinx
 pyjwt[crypto]==2.10.1
    # via pygithub
-pynacl==1.6.1
+pynacl==1.6.2
    # via pygithub
 python-dateutil==2.9.0.post0
    # via jupyter-client
@@ -282,7 +282,7 @@ typing-extensions==4.15.0
    #   pygithub
    #   referencing
    #   sqlalchemy
-urllib3==2.5.0
+urllib3==2.6.3
    # via
    #   pygithub
    #   requests
Author	SHA1	Message	Date
anisha-amd	773f5de407	Docs: Ray release 25.12 and compatibility version format standardization (#5845 )	2026-01-08 12:09:11 -05:00
dependabot[bot]	b297ced032	Bump urllib3 from 2.5.0 to 2.6.3 in /docs/sphinx (#5842 ) Bumps [urllib3](https://github.com/urllib3/urllib3) from 2.5.0 to 2.6.3. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/2.5.0...2.6.3) --- updated-dependencies: - dependency-name: urllib3 dependency-version: 2.6.3 dependency-type: indirect ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2026-01-08 08:22:01 -05:00
peterjunpark	2dc22ca890	fix(primus-pytorch.rst): FP8 config instead of BF16 (#5839 )	2026-01-07 13:49:31 -05:00
Joseph Macaranas	85102079ed	[External CI] Add SIMDe dev package to HIP runtime pipeline (#5838 )	2026-01-07 11:00:38 -05:00
dependabot[bot]	ba95e0e689	Bump pynacl from 1.6.1 to 1.6.2 in /docs/sphinx (#5836 ) Bumps [pynacl](https://github.com/pyca/pynacl) from 1.6.1 to 1.6.2. - [Changelog](https://github.com/pyca/pynacl/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pyca/pynacl/compare/1.6.1...1.6.2) --- updated-dependencies: - dependency-name: pynacl dependency-version: 1.6.2 dependency-type: indirect ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2026-01-06 14:10:42 -05:00
Pratik Basyal	1691d369e9	ROCM-core version fixed (#5827 )	2026-01-02 16:06:27 -05:00
peterjunpark	172b0f7c08	Fix inconsistency in xDiT doc Fix inconsistency in xDiT doc	2025-12-29 10:26:25 -05:00
peterjunpark	c67fac78bd	Update docs for xDiT diffusion inference 25.13 Docker release (#5820 ) * archive previous version * add xdit 25.13 * update history index * add perf results section	2025-12-29 08:44:45 -05:00
peterjunpark	e0b8ec4dfb	Update training docs for Primus/25.11 (#5819 ) * update conf and toc.yml.in * archive previous versions archive data files update anchors * primus pytorch: remove training batch size args * update primus megatron run cmds multi-node * update primus pytorch update * update update * update docker tag	2025-12-29 08:05:47 -05:00
Pratik Basyal	38f2d043dc	OS table removed from compatibility table [develop] (#5810 ) * OS table removed from compatibility table * Feedback added * Azure Linux 3.0 and compatibility version update * Version fix * Review feedback added * Minor change	2025-12-23 16:28:19 -05:00
peterjunpark	3a43bacdda	Update xdit diffusion inference history (#5808 ) * Update xdit diffusion inference history * fix	2025-12-22 11:05:32 -05:00
peterjunpark	48d8fe139b	fix link to ROCm PyT docker image (#5803 )	2025-12-19 15:47:55 -05:00
peterjunpark	7455fe57b8	clean up formatting in FA2 page (#5795 )	2025-12-19 09:21:41 -05:00
peterjunpark	52c0a47e84	Update Flash Attention guidance in "Model acceleration libraries" (#5793 ) * flash attention update Signed-off-by: seungrok.jung <seungrok.jung@amd.com> flash attention update Signed-off-by: seungrok.jung <seungrok.jung@amd.com> flash attention update Signed-off-by: seungrok.jung <seungrok.jung@amd.com> sentence-case heading * Update docs/how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> --------- Co-authored-by: seungrok.jung <seungrok.jung@amd.com> Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>	2025-12-19 08:48:52 -05:00
peterjunpark	cbab9a465d	Update documentation for JAX training MaxText 25.11 release (#5789 )	2025-12-18 11:23:58 -05:00
peterjunpark	459283da3c	xDiT diffusion inference v25.12 documentation update (#5786 ) * Add xdit-diffusion ROCm docs page. * Update template formatting and fix sphinx warnings * Add System Validation section. * Add sw component versions/commits. * Update to use latest v25.10 image instead of v25.9 * Update commands and add FLUX instructions. * Update Flux instructions. Change image tag. Describe as diffusion inference instead of specifically video. * git rm xdit-video-diffusion.rst * Docs for v25.12 * Add hyperlinks to components * Command fixes * -Diffusers suffix * Simplify yaml file and cleanup main rst page. * Spelling, added 'js' * fix merge conflict fix --------- Co-authored-by: Kristoffer <kristoffer.torp@amd.com>	2025-12-17 10:20:10 -05:00
peterjunpark	1b4f25733d	vLLM inference benchmark 1210 (#5776 ) * Archive previous ver fix anchors * Update vllm.rst and data yaml for 20251210	2025-12-17 09:21:57 -05:00
Ibrahim Wani	b287372be5	[origami] Test update (#5768 ) * Fix the skipping of origami tests * Update dependencies for origami refactor * test * Unsupress test output. * Ctest implementation * Test ctest * Test ctest 2 * Add pip install test * Fix python version * Add python dep * test * test 2 * Debug for readme * Fix pip install * Fix pip install 2 * Clean up * Run tests on 950 * Replace 950 with 1201 * 1101 * Add more archs * Add more archs 2 * Comment out archs * Move pip install script to ./azuredevops/scripts * Fix path * Fix path 2 * Fix path 3 * Fix path 4 * Remove pip install testing: * Use inline script * Add old deps	2025-12-16 15:37:41 -07:00