mirror of
https://github.com/ROCm/ROCm.git
synced 2026-01-09 22:58:17 -05:00
Compare commits
148 Commits
davidd-amd
...
develop
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
773f5de407 | ||
|
|
b297ced032 | ||
|
|
2dc22ca890 | ||
|
|
85102079ed | ||
|
|
ba95e0e689 | ||
|
|
1691d369e9 | ||
|
|
172b0f7c08 | ||
|
|
c67fac78bd | ||
|
|
e0b8ec4dfb | ||
|
|
38f2d043dc | ||
|
|
3a43bacdda | ||
|
|
48d8fe139b | ||
|
|
7455fe57b8 | ||
|
|
52c0a47e84 | ||
|
|
cbab9a465d | ||
|
|
459283da3c | ||
|
|
1b4f25733d | ||
|
|
b287372be5 | ||
|
|
78e8baf147 | ||
|
|
3e0c8b47e3 | ||
|
|
c3f0b99cc0 | ||
|
|
c9d1679486 | ||
|
|
fdbef17d7b | ||
|
|
6592a41a7f | ||
|
|
65a936023b | ||
|
|
2a64949081 | ||
|
|
0a17434517 | ||
|
|
2be7e5ac1e | ||
|
|
ae80c4a31c | ||
|
|
dd89a692e1 | ||
|
|
bf74351e5a | ||
|
|
f2067767e0 | ||
|
|
effd4174fb | ||
|
|
453751a86f | ||
|
|
fb644412d5 | ||
|
|
e8fdc34b71 | ||
|
|
b4031ef23c | ||
|
|
d0bd4e6f03 | ||
|
|
0056b9453e | ||
|
|
3d1ad79766 | ||
|
|
8683bed11b | ||
|
|
847cd7c423 | ||
|
|
42cad29c04 | ||
|
|
f7b2fe0a48 | ||
|
|
bb199aa2b9 | ||
|
|
2f7b2a7fa1 | ||
|
|
7fd75919d1 | ||
|
|
4490c57c6a | ||
|
|
007f24fe7b | ||
|
|
afbb6e0f61 | ||
|
|
1b5a3e54c2 | ||
|
|
2c6eb9cf2a | ||
|
|
b93fdb811c | ||
|
|
096d91e190 | ||
|
|
02037f4384 | ||
|
|
c64dc46a50 | ||
|
|
702d8e4c8e | ||
|
|
19344d7b61 | ||
|
|
807ec6afcf | ||
|
|
4c04da05c3 | ||
|
|
411334716c | ||
|
|
99f0875e70 | ||
|
|
50658d0812 | ||
|
|
7aeecdf8e2 | ||
|
|
4f669eb2c6 | ||
|
|
7d1f314303 | ||
|
|
c523f51e58 | ||
|
|
b566858909 | ||
|
|
c33b9e3611 | ||
|
|
2646b4841d | ||
|
|
ff2f40d800 | ||
|
|
71bcc5b204 | ||
|
|
fd840df30b | ||
|
|
58e26eede1 | ||
|
|
407a9d4cb0 | ||
|
|
81b7745f8e | ||
|
|
6af62fd30a | ||
|
|
bb692dfd84 | ||
|
|
8d51d0e803 | ||
|
|
66b8b96c72 | ||
|
|
fb098b6354 | ||
|
|
72107dd6d5 | ||
|
|
99c1590057 | ||
|
|
3d86323f88 | ||
|
|
636d4cc736 | ||
|
|
d1ce815d8d | ||
|
|
80ced95526 | ||
|
|
09c6a9fdef | ||
|
|
372ddd5af3 | ||
|
|
eb956cfc5c | ||
|
|
e05cdca54f | ||
|
|
04c7374f41 | ||
|
|
39de859bd1 | ||
|
|
c8531ac7ea | ||
|
|
420bbfa126 | ||
|
|
4881887e2c | ||
|
|
148d6670ad | ||
|
|
9770e9b6ef | ||
|
|
ee4cf66d67 | ||
|
|
908862242a | ||
|
|
6ba30f191c | ||
|
|
674dc355e4 | ||
|
|
c7f3a56811 | ||
|
|
0107fa731e | ||
|
|
a87ec360e1 | ||
|
|
7215e1e8c7 | ||
|
|
e4a59d8c66 | ||
|
|
8108fe7275 | ||
|
|
d3ff9d7c8e | ||
|
|
939ee7de0c | ||
|
|
f1e6c285dd | ||
|
|
ff1d9b4d69 | ||
|
|
ef3fa601d5 | ||
|
|
576191a104 | ||
|
|
2db07b5cda | ||
|
|
fe3dc988b8 | ||
|
|
36c879b7e0 | ||
|
|
91450dca10 | ||
|
|
2de92767e6 | ||
|
|
54d226acd9 | ||
|
|
f46d7ec00f | ||
|
|
09c946b6fb | ||
|
|
5285669d98 | ||
|
|
9b3138cffa | ||
|
|
61fffe3250 | ||
|
|
43ccfbbe80 | ||
|
|
1515fb3779 | ||
|
|
410a69efe4 | ||
|
|
248cbf8bc1 | ||
|
|
0171dced89 | ||
|
|
f2d6675839 | ||
|
|
7d0fad9aa8 | ||
|
|
4132a2609c | ||
|
|
c56d5b7495 | ||
|
|
a2e2bd3277 | ||
|
|
32d1cdcd90 | ||
|
|
ac16524ebd | ||
|
|
157d86b780 | ||
|
|
35ca027aa4 | ||
|
|
90c1d9068f | ||
|
|
cb8d21a0df | ||
|
|
6f8cf36279 | ||
|
|
8eb5fef37c | ||
|
|
a5f0b30a47 | ||
|
|
2ec051dec5 | ||
|
|
14ada81c41 | ||
|
|
b3459da524 | ||
|
|
eba211d7f1 |
@@ -128,6 +128,9 @@ jobs:
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
parameters:
|
||||
cmakeVersion: '3.28.6'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
@@ -152,6 +155,7 @@ jobs:
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DGPU_TARGETS=${{ job.target }}
|
||||
-DAMDGPU_TARGETS=${{ job.target }}
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
|
||||
-DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
|
||||
@@ -192,6 +196,9 @@ jobs:
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
parameters:
|
||||
cmakeVersion: '3.28.6'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
@@ -217,6 +224,7 @@ jobs:
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
-DGPU_TARGETS=${{ job.target }}
|
||||
-DAMDGPU_TARGETS=${{ job.target }}
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
|
||||
-DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
|
||||
|
||||
@@ -34,6 +34,7 @@ parameters:
|
||||
default:
|
||||
- cmake
|
||||
- libnuma-dev
|
||||
- libsimde-dev
|
||||
- mesa-common-dev
|
||||
- ninja-build
|
||||
- ocl-icd-libopencl1
|
||||
|
||||
@@ -1,10 +1,29 @@
|
||||
parameters:
|
||||
- name: componentName
|
||||
type: string
|
||||
default: amdsmi
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
# monorepo related parameters
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
- name: triggerDownstreamJobs
|
||||
type: boolean
|
||||
default: false
|
||||
- name: downstreamAggregateNames
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildDependsOn
|
||||
type: object
|
||||
default: null
|
||||
- name: unifiedBuild
|
||||
type: boolean
|
||||
default: false
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -31,7 +50,7 @@ parameters:
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: amdsmi_build_${{ job.os }}
|
||||
- job: ${{ parameters.componentName }}_build_${{ job.os }}
|
||||
pool:
|
||||
${{ if eq(job.os, 'ubuntu2404') }}:
|
||||
vmImage: 'ubuntu-24.04'
|
||||
@@ -55,6 +74,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
@@ -65,50 +85,54 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
componentName: ${{ parameters.componentName }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
# parameters:
|
||||
# aptPackages: ${{ parameters.aptPackages }}
|
||||
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: amdsmi_test_${{ job.os }}_${{ job.target }}
|
||||
dependsOn: amdsmi_build_${{ job.os }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
parameters:
|
||||
runRocminfo: false
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: amdsmi
|
||||
testDir: '$(Agent.BuildDirectory)'
|
||||
testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
|
||||
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
|
||||
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
parameters:
|
||||
runRocminfo: false
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
testDir: '$(Agent.BuildDirectory)'
|
||||
testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
|
||||
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
@@ -1,10 +1,29 @@
|
||||
parameters:
|
||||
- name: componentName
|
||||
type: string
|
||||
default: hipTensor
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
# monorepo related parameters
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
- name: triggerDownstreamJobs
|
||||
type: boolean
|
||||
default: false
|
||||
- name: downstreamAggregateNames
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildDependsOn
|
||||
type: object
|
||||
default: null
|
||||
- name: unifiedBuild
|
||||
type: boolean
|
||||
default: false
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -51,7 +70,7 @@ parameters:
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: hipTensor_build_${{ job.target }}
|
||||
- job: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
@@ -66,12 +85,15 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
extraBuildFlags: >-
|
||||
@@ -85,9 +107,12 @@ jobs:
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
@@ -95,44 +120,47 @@ jobs:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: hipTensor_test_${{ job.target }}
|
||||
timeoutInMinutes: 90
|
||||
dependsOn: hipTensor_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: hipTensor
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/hiptensor'
|
||||
testParameters: '-E ".*-extended" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: ${{ parameters.componentName }}_test_${{ job.target }}
|
||||
timeoutInMinutes: 90
|
||||
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/hiptensor'
|
||||
testParameters: '-E ".*-extended" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
@@ -39,6 +39,7 @@ parameters:
|
||||
- python3
|
||||
- python3-dev
|
||||
- python3-pip
|
||||
- python3-venv
|
||||
- libgtest-dev
|
||||
- libboost-filesystem-dev
|
||||
- libboost-program-options-dev
|
||||
@@ -46,6 +47,8 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
- nanobind>=2.0.0
|
||||
- pytest
|
||||
- pytest-cov
|
||||
- name: rocmDependencies
|
||||
type: object
|
||||
default:
|
||||
@@ -72,8 +75,10 @@ parameters:
|
||||
- { os: ubuntu2204, packageManager: apt }
|
||||
- { os: almalinux8, packageManager: dnf }
|
||||
testJobs:
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
|
||||
# - { os: ubuntu2204, packageManager: apt, target: gfx1151 }
|
||||
# - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
|
||||
- name: downstreamComponentMatrix
|
||||
type: object
|
||||
default:
|
||||
@@ -116,6 +121,11 @@ jobs:
|
||||
parameters:
|
||||
dependencyList:
|
||||
- gtest
|
||||
- ${{ if ne(job.os, 'almalinux8') }}:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
|
||||
parameters:
|
||||
dependencyList:
|
||||
- catch2
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
@@ -137,6 +147,7 @@ jobs:
|
||||
-DORIGAMI_BUILD_SHARED_LIBS=ON
|
||||
-DORIGAMI_ENABLE_PYTHON=ON
|
||||
-DORIGAMI_BUILD_TESTING=ON
|
||||
-DORIGAMI_ENABLE_FETCH=ON
|
||||
-GNinja
|
||||
- ${{ if ne(job.os, 'almalinux8') }}:
|
||||
- task: PublishPipelineArtifact@1
|
||||
@@ -169,7 +180,6 @@ jobs:
|
||||
dependsOn: origami_build_${{ job.os }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
@@ -180,30 +190,30 @@ jobs:
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
|
||||
parameters:
|
||||
dependencyList:
|
||||
- gtest
|
||||
- ${{ if ne(job.os, 'almalinux8') }}:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
|
||||
parameters:
|
||||
dependencyList:
|
||||
- catch2
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
preTargetFilter: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
- task: DownloadPipelineArtifact@2
|
||||
displayName: 'Download Build Directory Artifact'
|
||||
inputs:
|
||||
artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
|
||||
path: '$(Agent.BuildDirectory)/s/build'
|
||||
- task: DownloadPipelineArtifact@2
|
||||
displayName: 'Download Python Source Artifact'
|
||||
inputs:
|
||||
artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
|
||||
path: '$(Agent.BuildDirectory)/s/python'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
@@ -212,25 +222,72 @@ jobs:
|
||||
gpuTarget: ${{ job.target }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- task: CMake@1
|
||||
displayName: 'Origami Test CMake Configuration'
|
||||
inputs:
|
||||
cmakeArgs: >-
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
-DORIGAMI_BUILD_SHARED_LIBS=ON
|
||||
-DORIGAMI_ENABLE_PYTHON=ON
|
||||
-DORIGAMI_BUILD_TESTING=ON
|
||||
-GNinja
|
||||
$(Agent.BuildDirectory)/s
|
||||
- task: Bash@3
|
||||
displayName: 'Build Origami Tests and Python Bindings'
|
||||
inputs:
|
||||
targetType: inline
|
||||
workingDirectory: build
|
||||
script: |
|
||||
cmake --build . --target origami-tests origami_python -- -j$(nproc)
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
# Run tests using CTest (discovers and runs both C++ and Python tests)
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin'
|
||||
testExecutable: './origami-tests'
|
||||
testParameters: '--yaml origami-tests.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
- script: |
|
||||
set -e
|
||||
export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH
|
||||
|
||||
echo "--- Running origami_test.py ---"
|
||||
python3 $(Agent.BuildDirectory)/s/python/origami_test.py
|
||||
|
||||
echo "--- Running origami_grid_test.py ---"
|
||||
python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py
|
||||
displayName: 'Run Python Binding Tests'
|
||||
condition: succeeded()
|
||||
testDir: 'build'
|
||||
testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
# Test pip install workflow
|
||||
# - task: Bash@3
|
||||
# displayName: 'Test Pip Install'
|
||||
# inputs:
|
||||
# targetType: inline
|
||||
# script: |
|
||||
# set -e
|
||||
|
||||
# echo "==================================================================="
|
||||
# echo "Testing pip install workflow (pip install -e .)"
|
||||
# echo "==================================================================="
|
||||
|
||||
# # Set environment variables for pip install CMake build
|
||||
# export ROCM_PATH=$(Agent.BuildDirectory)/rocm
|
||||
# export CMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm:$(Agent.BuildDirectory)/vendor
|
||||
# export CMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
|
||||
|
||||
# echo "ROCM_PATH: $ROCM_PATH"
|
||||
# echo "CMAKE_PREFIX_PATH: $CMAKE_PREFIX_PATH"
|
||||
# echo "CMAKE_CXX_COMPILER: $CMAKE_CXX_COMPILER"
|
||||
# echo ""
|
||||
|
||||
# # Install from source directory
|
||||
# cd "$(Agent.BuildDirectory)/s/python"
|
||||
# pip install -e .
|
||||
|
||||
# # Verify import works
|
||||
# echo ""
|
||||
# echo "Verifying origami can be imported..."
|
||||
# python3 -c "import origami; print('✓ Successfully imported origami')"
|
||||
|
||||
# # Run pytest on installed package
|
||||
# echo ""
|
||||
# echo "Running pytest tests..."
|
||||
# python3 -m pytest tests/ -v -m "not slow" --tb=short
|
||||
|
||||
# echo ""
|
||||
# echo "==================================================================="
|
||||
# echo "Pip install test completed successfully"
|
||||
# echo "==================================================================="
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
|
||||
@@ -1,10 +1,35 @@
|
||||
parameters:
|
||||
- name: componentName
|
||||
type: string
|
||||
default: rccl
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
- name: systemsRepo
|
||||
type: string
|
||||
default: systems_repo
|
||||
- name: systemsSparseCheckoutDir
|
||||
type: string
|
||||
default: 'projects/rocprofiler-sdk'
|
||||
# monorepo related parameters
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
- name: triggerDownstreamJobs
|
||||
type: boolean
|
||||
default: false
|
||||
- name: downstreamAggregateNames
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildDependsOn
|
||||
type: object
|
||||
default: null
|
||||
- name: unifiedBuild
|
||||
type: boolean
|
||||
default: false
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -57,19 +82,28 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
buildJobs:
|
||||
- gfx942:
|
||||
target: gfx942
|
||||
- gfx90a:
|
||||
target: gfx90a
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||
testJobs:
|
||||
- gfx942:
|
||||
target: gfx942
|
||||
- gfx90a:
|
||||
target: gfx90a
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||
- name: downstreamComponentMatrix
|
||||
type: object
|
||||
default:
|
||||
- rocprofiler-sdk:
|
||||
name: rocprofiler-sdk
|
||||
sparseCheckoutDir: ''
|
||||
skipUnifiedBuild: 'false'
|
||||
buildDependsOn:
|
||||
- rccl_build
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: rccl_build_${{ job.target }}
|
||||
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
|
||||
${{ if parameters.buildDependsOn }}:
|
||||
dependsOn:
|
||||
- ${{ each build in parameters.buildDependsOn }}:
|
||||
- ${{ build }}_${{ job.os }}_${{ job.target }}
|
||||
timeoutInMinutes: 120
|
||||
variables:
|
||||
- group: common
|
||||
@@ -77,17 +111,23 @@ jobs:
|
||||
- name: HIP_ROCCLR_HOME
|
||||
value: $(Build.BinariesDirectory)/rocm
|
||||
pool: ${{ variables.MEDIUM_BUILD_POOL }}
|
||||
${{ if eq(job.os, 'almalinux8') }}:
|
||||
container:
|
||||
image: rocmexternalcicd.azurecr.io/manylinux228:latest
|
||||
endpoint: ContainerService3
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
submoduleBehaviour: recursive
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
|
||||
parameters:
|
||||
@@ -97,10 +137,14 @@ jobs:
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
os: ${{ job.os }}
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
|
||||
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
|
||||
@@ -112,58 +156,87 @@ jobs:
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
extraEnvVars:
|
||||
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
|
||||
installLatestCMake: true
|
||||
- ${{ if eq(job.os, 'ubuntu2204') }}:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
extraEnvVars:
|
||||
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
|
||||
installLatestCMake: true
|
||||
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: rccl_test_${{ job.target }}
|
||||
timeoutInMinutes: 120
|
||||
dependsOn: rccl_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: rccl
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin'
|
||||
testExecutable: './rccl-UnitTests'
|
||||
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
|
||||
timeoutInMinutes: 120
|
||||
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
preTargetFilter: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin'
|
||||
testExecutable: './rccl-UnitTests'
|
||||
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
- ${{ if parameters.triggerDownstreamJobs }}:
|
||||
- ${{ each component in parameters.downstreamComponentMatrix }}:
|
||||
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
|
||||
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.systemsRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.systemsSparseCheckoutDir }}
|
||||
triggerDownstreamJobs: true
|
||||
unifiedBuild: ${{ parameters.unifiedBuild }}
|
||||
${{ if parameters.unifiedBuild }}:
|
||||
buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
|
||||
${{ else }}:
|
||||
buildDependsOn: ${{ component.buildDependsOn }}
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
|
||||
|
||||
@@ -1,10 +1,29 @@
|
||||
parameters:
|
||||
- name: componentName
|
||||
type: string
|
||||
default: rocWMMA
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
# monorepo related parameters
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
- name: triggerDownstreamJobs
|
||||
type: boolean
|
||||
default: false
|
||||
- name: downstreamAggregateNames
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildDependsOn
|
||||
type: object
|
||||
default: null
|
||||
- name: unifiedBuild
|
||||
type: boolean
|
||||
default: false
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -66,7 +85,11 @@ parameters:
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: rocWMMA_build_${{ job.target }}
|
||||
- job: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
${{ if parameters.buildDependsOn }}:
|
||||
dependsOn:
|
||||
- ${{ each build in parameters.buildDependsOn }}:
|
||||
- ${{ build }}_${{ job.target }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
@@ -81,6 +104,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
@@ -102,9 +126,12 @@ jobs:
|
||||
# gfx1030 not supported in documentation
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
@@ -112,43 +139,45 @@ jobs:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: rocWMMA_test_${{ job.target }}
|
||||
timeoutInMinutes: 270
|
||||
dependsOn: rocWMMA_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: rocWMMA
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocwmma'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: ${{ parameters.componentName }}_test_${{ job.target }}
|
||||
timeoutInMinutes: 350
|
||||
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
|
||||
eq(${{ parameters.aggregatePipeline }}, False)
|
||||
)
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ job.target }}_test_pool
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
preTargetFilter: ${{ parameters.componentName }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocwmma'
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: rocm-cmake
|
||||
testParameters: '-E "pass-version-parent" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
testParameters: '-E "pass-version-parent" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
|
||||
os: ${{ job.os }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
|
||||
@@ -21,13 +21,35 @@ parameters:
|
||||
- libtbb-dev
|
||||
- libtiff-dev
|
||||
- libva-amdgpu-dev
|
||||
- libva2-amdgpu
|
||||
- mesa-amdgpu-va-drivers
|
||||
- libavcodec-dev
|
||||
- libavformat-dev
|
||||
- libavutil-dev
|
||||
- ninja-build
|
||||
- python3-pip
|
||||
- protobuf-compiler
|
||||
- libprotoc-dev
|
||||
- libopencv-dev
|
||||
- name: pipModules
|
||||
type: object
|
||||
default:
|
||||
- future==1.0.0
|
||||
- pytz==2022.1
|
||||
- numpy==1.23
|
||||
- google==3.0.0
|
||||
- protobuf==3.12.4
|
||||
- onnx==1.12.0
|
||||
- nnef==1.0.7
|
||||
- name: rocmDependencies
|
||||
type: object
|
||||
default:
|
||||
- AMDMIGraphX
|
||||
- aomp
|
||||
- aomp-extras
|
||||
- clr
|
||||
- half
|
||||
- composable_kernel
|
||||
- hipBLAS
|
||||
- hipBLAS-common
|
||||
- hipBLASLt
|
||||
@@ -40,7 +62,13 @@ parameters:
|
||||
- hipTensor
|
||||
- llvm-project
|
||||
- MIOpen
|
||||
- MIVisionX
|
||||
- rocm_smi_lib
|
||||
- rccl
|
||||
- rocAL
|
||||
- rocALUTION
|
||||
- rocBLAS
|
||||
- rocDecode
|
||||
- rocFFT
|
||||
- rocJPEG
|
||||
- rocPRIM
|
||||
@@ -57,7 +85,11 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
- AMDMIGraphX
|
||||
- aomp
|
||||
- aomp-extras
|
||||
- clr
|
||||
- half
|
||||
- composable_kernel
|
||||
- hipBLAS
|
||||
- hipBLAS-common
|
||||
- hipBLASLt
|
||||
@@ -70,7 +102,13 @@ parameters:
|
||||
- hipTensor
|
||||
- llvm-project
|
||||
- MIOpen
|
||||
- MIVisionX
|
||||
- rocm_smi_lib
|
||||
- rccl
|
||||
- rocAL
|
||||
- rocALUTION
|
||||
- rocBLAS
|
||||
- rocDecode
|
||||
- rocFFT
|
||||
- rocminfo
|
||||
- rocPRIM
|
||||
@@ -113,6 +151,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
registerROCmPackages: true
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
|
||||
parameters:
|
||||
@@ -212,5 +251,6 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
@@ -65,6 +65,13 @@ parameters:
|
||||
- pytest
|
||||
- pytest-cov
|
||||
- pytest-xdist
|
||||
- name: rocmDependencies
|
||||
type: object
|
||||
default:
|
||||
- clr
|
||||
- llvm-project
|
||||
- ROCR-Runtime
|
||||
- rocprofiler-sdk
|
||||
- name: rocmTestDependencies
|
||||
type: object
|
||||
default:
|
||||
@@ -101,10 +108,12 @@ jobs:
|
||||
${{ if parameters.buildDependsOn }}:
|
||||
dependsOn:
|
||||
- ${{ each build in parameters.buildDependsOn }}:
|
||||
- ${{ build }}_${{ job.os }}_${{ job.target }}
|
||||
- ${{ build }}_${{ job.target }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
- name: ROCM_PATH
|
||||
value: $(Agent.BuildDirectory)/rocm
|
||||
pool:
|
||||
vmImage: ${{ variables.BASE_BUILD_POOL }}
|
||||
workspace:
|
||||
@@ -119,6 +128,14 @@ jobs:
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
extraBuildFlags: >-
|
||||
|
||||
@@ -79,27 +79,27 @@ parameters:
|
||||
type: object
|
||||
default:
|
||||
buildJobs:
|
||||
- gfx942:
|
||||
target: gfx942
|
||||
- gfx90a:
|
||||
target: gfx90a
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||
testJobs:
|
||||
- gfx942:
|
||||
target: gfx942
|
||||
- gfx90a:
|
||||
target: gfx90a
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
|
||||
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: rocprofiler_sdk_build_${{ job.target }}
|
||||
- job: rocprofiler_sdk_build_${{ job.os }}_${{ job.target }}
|
||||
${{ if parameters.buildDependsOn }}:
|
||||
dependsOn:
|
||||
- ${{ each build in parameters.buildDependsOn }}:
|
||||
- ${{ build }}_${{ job.target }}
|
||||
- ${{ build }}_${{ job.os}}_${{ job.target }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
pool: ${{ variables.MEDIUM_BUILD_POOL }}
|
||||
${{ if eq(job.os, 'almalinux8') }}:
|
||||
container:
|
||||
image: rocmexternalcicd.azurecr.io/manylinux228:latest
|
||||
endpoint: ContainerService3
|
||||
workspace:
|
||||
clean: all
|
||||
steps:
|
||||
@@ -107,6 +107,7 @@ jobs:
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
registerROCmPackages: true
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
@@ -118,6 +119,7 @@ jobs:
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
@@ -132,6 +134,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DROCPROFILER_BUILD_TESTS=ON
|
||||
@@ -143,6 +146,7 @@ jobs:
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
@@ -158,8 +162,8 @@ jobs:
|
||||
|
||||
- ${{ if eq(parameters.unifiedBuild, False) }}:
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: rocprofiler_sdk_test_${{ job.target }}
|
||||
dependsOn: rocprofiler_sdk_build_${{ job.target }}
|
||||
- job: rocprofiler_sdk_test_${{ job.os }}_${{ job.target }}
|
||||
dependsOn: rocprofiler_sdk_build_${{ job.os }}_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
@@ -177,6 +181,7 @@ jobs:
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
pipModules: ${{ parameters.pipModules }}
|
||||
packageManager: ${{ job.packageManager }}
|
||||
registerROCmPackages: true
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
@@ -188,6 +193,7 @@ jobs:
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
os: ${{ job.os }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
@@ -202,6 +208,7 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
os: ${{ job.os }}
|
||||
extraBuildFlags: >-
|
||||
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
|
||||
-DROCPROFILER_BUILD_TESTS=ON
|
||||
@@ -213,7 +220,8 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
testDir: $(Agent.BuildDirectory)/s/build
|
||||
os: ${{ job.os }}
|
||||
testDir: $(Agent.BuildDirectory)/build
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
parameters:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
|
||||
@@ -63,6 +63,7 @@ parameters:
|
||||
libopenblas-dev: openblas-devel
|
||||
libopenmpi-dev: openmpi-devel
|
||||
libpci-dev: libpciaccess-devel
|
||||
libsimde-dev: simde-devel
|
||||
libssl-dev: openssl-devel
|
||||
# note: libstdc++-devel is in the base packages list
|
||||
libsystemd-dev: systemd-devel
|
||||
|
||||
@@ -35,8 +35,8 @@ parameters:
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
amdsmi:
|
||||
pipelineId: 99
|
||||
developBranch: amd-staging
|
||||
pipelineId: 376
|
||||
developBranch: develop
|
||||
hasGpuTarget: false
|
||||
aomp-extras:
|
||||
pipelineId: 111
|
||||
@@ -115,7 +115,7 @@ parameters:
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
hipTensor:
|
||||
pipelineId: 105
|
||||
pipelineId: 374
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
llvm-project:
|
||||
@@ -263,7 +263,7 @@ parameters:
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
rocWMMA:
|
||||
pipelineId: 109
|
||||
pipelineId: 370
|
||||
developBranch: develop
|
||||
hasGpuTarget: true
|
||||
rpp:
|
||||
|
||||
@@ -13,7 +13,7 @@ parameters:
|
||||
default: ctest
|
||||
- name: testParameters
|
||||
type: string
|
||||
default: --output-on-failure --force-new-ctest-process --output-junit test_output.xml
|
||||
default: --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml
|
||||
- name: extraTestParameters
|
||||
type: string
|
||||
default: ''
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,6 +1,7 @@
|
||||
.venv
|
||||
.vscode
|
||||
build
|
||||
__pycache__
|
||||
|
||||
# documentation artifacts
|
||||
_build/
|
||||
|
||||
@@ -27,6 +27,7 @@ ASICs
|
||||
ASan
|
||||
ASAN
|
||||
ASm
|
||||
Async
|
||||
ATI
|
||||
atomicRMW
|
||||
AddressSanitizer
|
||||
@@ -34,6 +35,7 @@ AlexNet
|
||||
Andrej
|
||||
Arb
|
||||
Autocast
|
||||
autograd
|
||||
BARs
|
||||
BatchNorm
|
||||
BLAS
|
||||
@@ -77,6 +79,7 @@ CX
Cavium
CentOS
ChatGPT
Cholesky
CoRR
Codespaces
Commitizen
@@ -86,9 +89,11 @@ Conda
ConnectX
CountOnes
CuPy
customizable
da
Dashboarding
Dataloading
dataflows
DBRX
DDR
DF
@@ -130,10 +135,13 @@ ELMo
ENDPGM
EPYC
ESXi
EP
EoS
etcd
equalto
fas
FBGEMM
FiLM
FIFOs
FFT
FFTs
@@ -154,10 +162,12 @@ Fortran
Fuyu
GALB
GAT
GATNE
GCC
GCD
GCDs
GCN
GCNN
GDB
GDDR
GDR
@@ -176,13 +186,16 @@ Glibc
GLXT
Gloo
GMI
GNN
GNNs
GPG
GPR
GPT
GPU
GPU's
GPUDirect
GPUs
Graphbolt
GraphBolt
GraphSage
GRBM
GRE
@@ -212,7 +225,10 @@ Haswell
Higgs
href
Hyperparameters
HybridEngine
Huggingface
Hunyuan
HunyuanVideo
IB
ICD
ICT
@@ -243,7 +259,9 @@ Intersphinx
Intra
Ioffe
JAX's
JAXLIB
Jinja
js
JSON
Jupyter
KFD
@@ -263,6 +281,7 @@ LLM
LLMs
LLVM
LM
logsumexp
LRU
LSAN
LSan
@@ -298,6 +317,7 @@ Makefiles
Matplotlib
Matrox
MaxText
MBT
Megablocks
Megatrends
Megatron
@@ -307,12 +327,14 @@ Meta's
Miniconda
MirroredStrategy
Mixtral
MLA
MosaicML
MoEs
Mooncake
Mpops
Multicore
Multithreaded
mx
MXFP
MyEnvironment
MyST
@@ -349,6 +371,7 @@ OFED
OMM
OMP
OMPI
OOM
OMPT
OMPX
ONNX
@@ -375,6 +398,7 @@ perf
PEQT
PIL
PILImage
PJRT
POR
PRNG
PRs
@@ -394,6 +418,7 @@ Profiler's
PyPi
Pytest
PyTorch
QPS
Qcycles
Qwen
RAII
@@ -496,13 +521,12 @@ TPS
TPU
TPUs
TSME
Taichi
Taichi's
Tagram
TensileLite
TensorBoard
TensorFlow
TensorParallel
TheRock
ToC
TorchAudio
torchaudio
@@ -520,6 +544,7 @@ UAC
UC
UCC
UCX
ud
UE
UIF
UMC
@@ -669,6 +694,7 @@ denoised
denoises
denormalize
dequantization
dequantized
dequantizes
deserializers
detections
@@ -784,6 +810,7 @@ linalg
linearized
linter
linux
llm
llvm
lm
localscratch
@@ -829,11 +856,13 @@ pallas
parallelization
parallelizing
param
params
parameterization
passthrough
pe
perfcounter
performant
piecewise
perl
pragma
pre
@@ -874,6 +903,7 @@ querySelectorAll
queueing
qwen
radeon
rc
rccl
rdc
rdma
@@ -935,6 +965,7 @@ scalability
scalable
scipy
seealso
selectattr
selectedTag
sendmsg
seqs
@@ -980,6 +1011,7 @@ tokenizer
tokenizes
toolchain
toolchains
topk
toolset
toolsets
torchtitan
@@ -1007,6 +1039,7 @@ USM
UTCL
UTIL
utils
UX
vL
variational
vdi
@@ -1036,6 +1069,8 @@ writebacks
wrreq
wzo
xargs
xdit
xDiT
xGMI
xPacked
xz

981 CHANGELOG.md
File diff suppressed because it is too large

747 RELEASE.md
File diff suppressed because it is too large

26 default.xml
@@ -1,33 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<manifest>
<remote name="rocm-org" fetch="https://github.com/ROCm/" />
<default revision="refs/tags/rocm-7.0.2"
<default revision="refs/tags/rocm-7.1.1"
remote="rocm-org"
sync-c="true"
sync-j="4" />
<!--list of projects for ROCm-->
<project name="ROCK-Kernel-Driver" />
<project name="ROCR-Runtime" />
<project name="amdsmi" />
<project name="aqlprofile" />
<project name="rdc" />
<project name="rocm_bandwidth_test" />
<project name="rocm_smi_lib" />
<project name="rocm-core" />
<project name="rocm-examples" />
<project name="rocminfo" />
<project name="rocprofiler" />
<project name="rocprofiler-register" />
<project name="rocprofiler-sdk" />
<project name="rocprofiler-compute" />
<project name="rocprofiler-systems" />
<project name="roctracer" />
<!--HIP Projects-->
<project name="hip" />
<project name="hip-tests" />
<project name="HIPIFY" />
<project name="clr" />
<project name="hipother" />
<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
<project name="half" />
<project name="llvm-project" />
@@ -41,6 +25,7 @@
<project groups="mathlibs" name="MIVisionX" />
<project groups="mathlibs" name="ROCmValidationSuite" />
<project groups="mathlibs" name="composable_kernel" />
<project groups="mathlibs" name="hipSOLVER" />
<project groups="mathlibs" name="hipTensor" />
<project groups="mathlibs" name="hipfort" />
<project groups="mathlibs" name="rccl" />
@@ -54,7 +39,14 @@
MIOpen rocBLAS rocFFT rocPRIM rocRAND
rocSPARSE rocThrust Tensile -->
<project groups="mathlibs" name="rocm-libraries" />
<!-- The following components have been migrated to rocm-systems:
aqlprofile clr hip hip-tests hipother
rdc rocm-core rocm_smi_lib rocminfo rocprofiler-compute
rocprofiler-register rocprofiler-sdk rocprofiler-systems
rocprofiler rocr-runtime roctracer -->
<project groups="mathlibs" name="rocm-systems" />
<project groups="mathlibs" name="rocPyDecode" />
<project groups="mathlibs" name="rocSOLVER" />
<project groups="mathlibs" name="rocSHMEM" />
<project groups="mathlibs" name="rocWMMA" />
<project groups="mathlibs" name="rocm-cmake" />

@@ -25,69 +25,69 @@ additional licenses. Please review individual repositories for more information.
<!-- spellcheck-disable -->
| Component | License |
|:---------------------|:-------------------------|
| [AMD Compute Language Runtime (CLR)](https://github.com/ROCm/clr) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/LICENSE.txt) |
| [AMD Compute Language Runtime (CLR)](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/LICENSE.md) |
| [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/amd-staging/LICENSE) |
| [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
| [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
| [AQLprofile](https://github.com/rocm/aqlprofile/) | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE.md) |
| [AQLprofile](https://github.com/ROCm/rocm-systems/tree/develop/projects/aqlprofile/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/aqlprofile/LICENSE.md) |
| [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
| [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
| [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
| [HIP](https://github.com/ROCm/HIP/) | [MIT](https://github.com/ROCm/HIP/blob/amd-staging/LICENSE.txt) |
| [hipamd](https://github.com/ROCm/clr/tree/amd-staging/hipamd) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/hipamd/LICENSE.txt) |
| [hipBLAS](https://github.com/ROCm/hipBLAS/) | [MIT](https://github.com/ROCm/hipBLAS/blob/develop/LICENSE.md) |
| [hipBLASLt](https://github.com/ROCm/hipBLASLt/) | [MIT](https://github.com/ROCm/hipBLASLt/blob/develop/LICENSE.md) |
| [HIP](https://github.com/ROCm/rocm-systems/tree/develop/projects/hip/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/hip/LICENSE.md) |
| [hipamd](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr/hipamd/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/hipamd/LICENSE.md) |
| [hipBLAS](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblas/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblas/LICENSE.md) |
| [hipBLASLt](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipblaslt/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblaslt/LICENSE.md) |
| [HIPCC](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc) | [MIT](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/hipcc/LICENSE.txt) |
| [hipCUB](https://github.com/ROCm/hipCUB/) | [Custom](https://github.com/ROCm/hipCUB/blob/develop/LICENSE.txt) |
| [hipFFT](https://github.com/ROCm/hipFFT/) | [MIT](https://github.com/ROCm/hipFFT/blob/develop/LICENSE.md) |
| [hipCUB](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipcub/) | [Custom](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipcub/LICENSE.txt) |
| [hipFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipfft/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipfft/LICENSE.md) |
| [hipfort](https://github.com/ROCm/hipfort/) | [MIT](https://github.com/ROCm/hipfort/blob/develop/LICENSE) |
| [HIPIFY](https://github.com/ROCm/HIPIFY/) | [MIT](https://github.com/ROCm/HIPIFY/blob/amd-staging/LICENSE.txt) |
| [hipRAND](https://github.com/ROCm/hipRAND/) | [MIT](https://github.com/ROCm/hipRAND/blob/develop/LICENSE.txt) |
| [hipSOLVER](https://github.com/ROCm/hipSOLVER/) | [MIT](https://github.com/ROCm/hipSOLVER/blob/develop/LICENSE.md) |
| [hipSPARSE](https://github.com/ROCm/hipSPARSE/) | [MIT](https://github.com/ROCm/hipSPARSE/blob/develop/LICENSE.md) |
| [hipSPARSELt](https://github.com/ROCm/hipSPARSELt/) | [MIT](https://github.com/ROCm/hipSPARSELt/blob/develop/LICENSE.md) |
| [hipTensor](https://github.com/ROCm/hipTensor) | [MIT](https://github.com/ROCm/hipTensor/blob/develop/LICENSE) |
| [hipRAND](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hiprand/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hiprand/LICENSE.md) |
| [hipSOLVER](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsolver/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsolver/LICENSE.md) |
| [hipSPARSE](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsparse/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsparse/LICENSE.md) |
| [hipSPARSELt](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hipsparselt/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipsparselt/LICENSE.md) |
| [hipTensor](https://github.com/ROCm/rocm-libraries/tree/develop/projects/hiptensor/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hiptensor/LICENSE) |
| [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
| [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
| [MIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/miopen/LICENSE.md) |
| [MIOpen](https://github.com/ROCm/rocm-libraries/tree/develop/projects/miopen/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/miopen/LICENSE.md) |
| [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
| [rocAL](https://github.com/ROCm/rocAL) | [MIT](https://github.com/ROCm/rocAL/blob/develop/LICENSE.txt) |
| [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
| [rocBLAS](https://github.com/ROCm/rocBLAS/) | [MIT](https://github.com/ROCm/rocBLAS/blob/develop/LICENSE.md) |
| [rocBLAS](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocblas/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocblas/LICENSE.md) |
| [ROCdbgapi](https://github.com/ROCm/ROCdbgapi/) | [MIT](https://github.com/ROCm/ROCdbgapi/blob/amd-staging/LICENSE.txt) |
| [rocDecode](https://github.com/ROCm/rocDecode) | [MIT](https://github.com/ROCm/rocDecode/blob/develop/LICENSE) |
| [rocFFT](https://github.com/ROCm/rocFFT/) | [MIT](https://github.com/ROCm/rocFFT/blob/develop/LICENSE.md) |
| [rocFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocfft/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocfft/LICENSE.md) |
| [ROCgdb](https://github.com/ROCm/ROCgdb/) | [GNU General Public License v3.0](https://github.com/ROCm/ROCgdb/blob/amd-staging/COPYING3) |
| [rocJPEG](https://github.com/ROCm/rocJPEG/) | [MIT](https://github.com/ROCm/rocJPEG/blob/develop/LICENSE) |
| [ROCK-Kernel-Driver](https://github.com/ROCm/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/ROCm/ROCK-Kernel-Driver/blob/master/COPYING) |
| [rocminfo](https://github.com/ROCm/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocminfo/blob/amd-staging/License.txt) |
| [rocminfo](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocminfo/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocminfo/License.txt) |
| [ROCm Bandwidth Test](https://github.com/ROCm/rocm_bandwidth_test/) | [MIT](https://github.com/ROCm/rocm_bandwidth_test/blob/master/LICENSE.txt) |
| [ROCm CMake](https://github.com/ROCm/rocm-cmake/) | [MIT](https://github.com/ROCm/rocm-cmake/blob/develop/LICENSE) |
| [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
| [ROCm-Core](https://github.com/ROCm/rocm-core) | [MIT](https://github.com/ROCm/rocm-core/blob/master/copyright) |
| [ROCm Compute Profiler](https://github.com/ROCm/rocprofiler-compute) | [MIT](https://github.com/ROCm/rocprofiler-compute/blob/amd-staging/LICENSE) |
| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/amd-staging/LICENSE.md) |
| [ROCm-Core](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocm-core/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocm-core/LICENSE.md) |
| [ROCm Compute Profiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-compute/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-compute/LICENSE.md) |
| [ROCm Data Center (RDC)](https://github.com/ROCm/rocm-systems/tree/develop/projects/rdc/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rdc/LICENSE.md) |
| [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/clr/tree/amd-staging/opencl) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/opencl/LICENSE.txt) |
| [ROCm-OpenCL-Runtime](https://github.com/ROCm/rocm-systems/tree/develop/projects/clr/opencl/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/clr/opencl/LICENSE.md) |
| [ROCm Performance Primitives (RPP)](https://github.com/ROCm/rpp) | [MIT](https://github.com/ROCm/rpp/blob/develop/LICENSE) |
| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/amd-staging/LICENSE.md) |
| [ROCm Systems Profiler](https://github.com/ROCm/rocprofiler-systems) | [MIT](https://github.com/ROCm/rocprofiler-systems/blob/amd-staging/LICENSE.md) |
| [ROCm SMI Lib](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocm-smi-lib/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocm-smi-lib/LICENSE.md) |
| [ROCm Systems Profiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-systems/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-systems/LICENSE.md) |
| [ROCm Validation Suite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
| [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-staging/LICENSE.md) |
| [ROCprofiler-SDK](https://github.com/ROCm/rocprofiler-sdk) | [MIT](https://github.com/ROCm/rocprofiler-sdk/blob/amd-mainline/LICENSE) |
| [rocPRIM](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocprim/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocprim/LICENSE.md) |
| [ROCProfiler](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler/LICENSE.md) |
| [ROCprofiler-SDK](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-sdk/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-sdk/LICENSE.md) |
| [rocPyDecode](https://github.com/ROCm/rocPyDecode) | [MIT](https://github.com/ROCm/rocPyDecode/blob/develop/LICENSE.txt) |
| [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
| [rocRAND](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocrand/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocrand/LICENSE.md) |
| [ROCr Debug Agent](https://github.com/ROCm/rocr_debug_agent/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocr_debug_agent/blob/amd-staging/LICENSE.txt) |
| [ROCR-Runtime](https://github.com/ROCm/ROCR-Runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/ROCR-Runtime/blob/amd-staging/LICENSE.txt) |
| [ROCR-Runtime](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocr-runtime/) | [The University of Illinois/NCSA](https://github.com/ROCm/rocm-systems/blob/develop/projects/rocr-runtime/LICENSE.txt) |
| [rocSHMEM](https://github.com/ROCm/rocSHMEM/) | [MIT](https://github.com/ROCm/rocSHMEM/blob/develop/LICENSE.md) |
| [rocSOLVER](https://github.com/ROCm/rocSOLVER/) | [BSD-2-Clause](https://github.com/ROCm/rocSOLVER/blob/develop/LICENSE.md) |
| [rocSPARSE](https://github.com/ROCm/rocSPARSE/) | [MIT](https://github.com/ROCm/rocSPARSE/blob/develop/LICENSE.md) |
| [rocThrust](https://github.com/ROCm/rocThrust/) | [Apache 2.0](https://github.com/ROCm/rocThrust/blob/develop/LICENSE) |
| [ROCTracer](https://github.com/ROCm/roctracer/) | [MIT](https://github.com/ROCm/roctracer/blob/amd-master/LICENSE) |
| [rocWMMA](https://github.com/ROCm/rocWMMA/) | [MIT](https://github.com/ROCm/rocWMMA/blob/develop/LICENSE.md) |
| [Tensile](https://github.com/ROCm/Tensile/) | [MIT](https://github.com/ROCm/Tensile/blob/develop/LICENSE.md) |
| [rocSOLVER](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocsolver/) | [BSD-2-Clause](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocsolver/LICENSE.md) |
| [rocSPARSE](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocsparse/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocsparse/LICENSE.md) |
| [rocThrust](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocthrust/) | [Apache 2.0](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocthrust/LICENSE) |
| [ROCTracer](https://github.com/ROCm/rocm-systems/tree/develop/projects/roctracer/) | [MIT](https://github.com/ROCm/rocm-systems/blob/develop/projects/roctracer/LICENSE.md) |
| [rocWMMA](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocwmma/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocwmma/LICENSE.md) |
| [Tensile](https://github.com/ROCm/rocm-libraries/tree/develop/shared/tensile/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/shared/tensile/LICENSE.md) |
| [TransferBench](https://github.com/ROCm/TransferBench) | [MIT](https://github.com/ROCm/TransferBench/blob/develop/LICENSE.md) |

Open sourced ROCm components are released via public GitHub

@@ -1,137 +1,136 @@
|
||||
ROCm Version,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
|
||||
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
|
||||
,,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
|
||||
,"RHEL 10.0 [#rhel-10-702-past-60]_, 9.6 [#rhel-10-702-past-60]_, 9.4 [#rhel-94-702-past-60]_","RHEL 9.6 [#rhel-10-702-past-60]_, 9.4 [#rhel-94-702-past-60]_","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
|
||||
,RHEL 8.10 [#rhel-700-past-60]_,RHEL 8.10 [#rhel-700-past-60]_,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
|
||||
,SLES 15 SP7 [#sles-db-700-past-60]_,SLES 15 SP7 [#sles-db-700-past-60]_,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
|
||||
,,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
|
||||
,"Oracle Linux 10, 9, 8 [#ol-700-mi300x-past-60]_","Oracle Linux 9, 8 [#ol-700-mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
|
||||
,"Debian 13 [#db-mi300x-past-60]_, 12 [#sles-db-700-past-60]_",Debian 12 [#sles-db-700-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
|
||||
,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,,,,,,,,,,,,
|
||||
,Rocky Linux 9 [#rl-700-past-60]_,Rocky Linux 9 [#rl-700-past-60]_,,,,,,,,,,,,,,,,,,
|
||||
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,,,,,,,,,,,,,,,,,,
|
||||
,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
|
||||
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
|
||||
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
|
||||
,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
|
||||
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
|
||||
,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950 [#mi350x-os-past-60]_,gfx950 [#mi350x-os-past-60]_,,,,,,,,,,,,,,,,,,
|
||||
,gfx1201 [#RDNA-OS-700-past-60]_,gfx1201 [#RDNA-OS-700-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
|
||||
,gfx1200 [#RDNA-OS-700-past-60]_,gfx1200 [#RDNA-OS-700-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
|
||||
,gfx1101 [#RDNA-OS-700-past-60]_ [#rd-v710-past-60]_,gfx1101 [#RDNA-OS-700-past-60]_ [#rd-v710-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
|
||||
,gfx1100 [#RDNA-OS-700-past-60]_,gfx1100 [#RDNA-OS-700-past-60]_,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030 [#RDNA-OS-700-past-60]_ [#rd-v620-past-60]_,gfx1030 [#RDNA-OS-700-past-60]_ [#rd-v620-past-60]_,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
|
||||
,gfx942 [#mi325x-os-past-60]_ [#mi300x-os-past-60]_ [#mi300A-os-past-60]_,gfx942 [#mi325x-os-past-60]_ [#mi300x-os-past-60]_ [#mi300A-os-past-60]_,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
|
||||
,gfx90a [#mi200x-os-past-60]_,gfx90a [#mi200x-os-past-60]_,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
|
||||
,gfx908 [#mi100-os-past-60]_,gfx908 [#mi100-os-past-60]_,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
|
||||
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,b6356,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
Thrust,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
CUB,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
DRIVER & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.13.0,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.0,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
|
||||
:doc:`rocAL <rocal:index>`,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.0.0,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.1.0,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.6.0,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`RPP <rpp:index>`,2.0.0,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`RCCL <rccl:index>`,2.26.6,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.0.2,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.0.0,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.20,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
|
||||
:doc:`hipfort <hipfort:index>`,0.7.0,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.0.0,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.0.0,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.0.1,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.4,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.0,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
|
||||
:doc:`rocBLAS <rocblas:index>`,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.34,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
|
||||
:doc:`rocRAND <rocrand:index>`,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.30.1,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.0.2,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`hipCUB <hipcub:index>`,4.0.0,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.0.1,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.0.51830,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.1.0,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70002,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70002,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
,,,,,,,,,,,,,,,,,,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
:doc:`HIP <hip:index>`,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
|
||||
ROCm Version,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>` [#os-compatibility-past-60]_,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
|
||||
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
|
||||
,,,,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
|
||||
,"RHEL 10.1, 10.0, 9.7, 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
|
||||
,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
|
||||
,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,SLES 15 SP7,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
|
||||
,,,,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
|
||||
,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8","Oracle Linux 9, 8",Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.10,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,Oracle Linux 8.9,,,
|
||||
,"Debian 13, 12","Debian 13, 12","Debian 13, 12",Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,Debian 12,,,,,,,,,,,
|
||||
,,,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,Azure Linux 3.0,,,,,,,,,,,,
|
||||
,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,Rocky Linux 9,,,,,,,,,,,,,,,,,,
|
||||
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,CDNA4,CDNA4,,,,,,,,,,,,,,,,,,
|
||||
,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
|
||||
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
|
||||
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
|
||||
,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
|
||||
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
|
||||
,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>` [#gpu-compatibility-past-60]_,gfx950,gfx950,gfx950,gfx950,,,,,,,,,,,,,,,,,,
|
||||
,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,gfx1201,,,,,,,,,,,,,,,
|
||||
,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,gfx1200,,,,,,,,,,,,,,,
|
||||
,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,gfx1101,,,,,,,,,,,,,,,
|
||||
,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
|
||||
,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942, gfx942, gfx942, gfx942, gfx942, gfx942, gfx942
|
||||
,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
|
||||
,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9, 2.8, 2.7","2.8, 2.7, 2.6","2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.7.1,0.7.1,0.6.0,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
|
||||
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat-past-60]_,N/A,N/A,N/A,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,2.4.0,2.4.0,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,2.51.1,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,N/A,N/A,b6652,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.23.1,1.22.0,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.4.0,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.17.0,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
Thrust,2.8.5,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
CUB,2.8.5,2.8.5,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
DRIVER & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.20.1, 30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.20.0 [#mi325x_KVM-past-60]_, 30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x","30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.14.0,2.14.0,2.13.0,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.1,3.5.1,3.5.0,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
|
||||
:doc:`rocAL <rocal:index>`,2.4.0,2.4.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.4.0,1.4.0,1.0.0,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.2.0,1.2.0,1.1.0,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.7.0,0.7.0,0.6.0,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`RPP <rpp:index>`,2.1.0,2.1.0,2.0.0,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.26.6,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.1.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.1.0,3.1.0,3.0.2,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.1.0,1.1.0,1.0.0,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.21,1.0.21,1.0.20,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
|
||||
:doc:`hipfort <hipfort:index>`,0.7.1,0.7.1,0.7.0,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.1.0,3.1.0,3.0.0,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.1.0,3.1.0,3.0.0,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.1.0,4.0.1,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.5,0.2.4,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.1,4.0.0,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
|
||||
:doc:`rocBLAS <rocblas:index>`,5.1.1,5.1.0,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.35,1.0.35,1.0.34,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
|
||||
:doc:`rocRAND <rocrand:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.31.0,3.30.1,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.1.0,4.0.2,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.1.0,2.0.0,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.44.0,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`hipCUB <hipcub:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,2.0.0,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.1.0,4.1.0,4.0.1,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.1.0,4.1.0,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.1,7.1.0,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.2.0,26.1.0,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,1.1.0,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.8.0,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.3.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,2.6.0,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.1,3.3.0,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.1,1.2.0,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70101,2.0.70100,2.0.70002,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,1.0.0,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70101,4.1.70100,4.1.70002,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,20.0.0,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.4,0.77.4,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,16.3.0,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.1.0,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.0.25444,20.0.0.25425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.0.25444,20.0.0.25425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25444,20.0.0.25425,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
|
||||
,,,,,,,,,,,,,,,,,,,,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
:doc:`HIP <hip:index>`,7.1.52802,7.1.25424,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.18.0,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
|
||||
|
||||
|
@@ -12,7 +12,7 @@ You can also refer to the :ref:`past versions of ROCm compatibility matrix<past-
|
||||
|
||||
GPUs listed in the following table support compute workloads (no display
|
||||
information or graphics). If you’re using ROCm with AMD Radeon GPUs or Ryzen APUs for graphics
|
||||
workloads, see the :docs:`Use ROCm on Radeon and Ryzen <radeon:index.html>` to verify
|
||||
workloads, see the :doc:`Use ROCm on Radeon and Ryzen <radeon:index>` to verify
|
||||
compatibility and system requirements.
|
||||
|
||||
.. |br| raw:: html
|
||||
@@ -22,18 +22,18 @@ compatibility and system requirements.
|
||||
.. container:: format-big-table
|
||||
|
||||
.. csv-table::
|
||||
:header: "ROCm Version", "7.0.2", "7.0.1/7.0.0", "6.4.0"
|
||||
:header: "ROCm Version", "7.1.1", "7.1.0", "6.4.0"
|
||||
:stub-columns: 1
|
||||
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2
|
||||
:ref:`Operating systems & kernels <OS-kernel-versions>` [#os-compatibility]_,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2
|
||||
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5
|
||||
,"RHEL 10.0 [#rhel-10-702]_, 9.6 [#rhel-10-702]_, 9.4 [#rhel-94-702]_","RHEL 9.6 [#rhel-10-702]_, 9.4 [#rhel-94-702]_","RHEL 9.5, 9.4"
|
||||
,RHEL 8.10 [#rhel-700]_,RHEL 8.10 [#rhel-700]_,RHEL 8.10
|
||||
,SLES 15 SP7 [#sles-db-700]_,SLES 15 SP7 [#sles-db-700]_,SLES 15 SP6
|
||||
,"Oracle Linux 10, 9, 8 [#ol-700-mi300x]_","Oracle Linux 9, 8 [#ol-700-mi300x]_","Oracle Linux 9, 8 [#ol-mi300x]_"
|
||||
,"Debian 13 [#db-mi300x]_, 12 [#sles-db-700]_",Debian 12 [#sles-db-700]_,Debian 12 [#single-node]_
|
||||
,Azure Linux 3.0 [#az-mi300x]_,Azure Linux 3.0 [#az-mi300x]_,Azure Linux 3.0 [#az-mi300x]_
|
||||
,Rocky Linux 9 [#rl-700]_,Rocky Linux 9 [#rl-700]_,
|
||||
,"RHEL 10.1, 10.0, 9.7, |br| 9.6, 9.4","RHEL 10.0, 9.6, 9.4","RHEL 9.5, 9.4"
|
||||
,RHEL 8.10,RHEL 8.10,RHEL 8.10
|
||||
,SLES 15 SP7,SLES 15 SP7,SLES 15 SP6
|
||||
,"Oracle Linux 10, 9, 8","Oracle Linux 10, 9, 8","Oracle Linux 9, 8"
|
||||
,"Debian 13, 12","Debian 13, 12",Debian 12
|
||||
,,,Azure Linux 3.0
|
||||
,Rocky Linux 9,Rocky Linux 9,
|
||||
,.. _architecture-support-compatibility-matrix:,,
|
||||
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,
|
||||
,CDNA3,CDNA3,CDNA3
|
||||
@@ -43,99 +43,99 @@ compatibility and system requirements.
|
||||
,RDNA3,RDNA3,RDNA3
|
||||
,RDNA2,RDNA2,RDNA2
|
||||
,.. _gpu-support-compatibility-matrix:,,
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950 [#mi350x-os]_,gfx950 [#mi350x-os]_,
|
||||
,gfx1201 [#RDNA-OS-700]_,gfx1201 [#RDNA-OS-700]_,
|
||||
,gfx1200 [#RDNA-OS-700]_,gfx1200 [#RDNA-OS-700]_,
|
||||
,gfx1101 [#RDNA-OS-700]_ [#rd-v710]_,gfx1101 [#RDNA-OS-700]_ [#rd-v710]_,
|
||||
,gfx1100 [#RDNA-OS-700]_,gfx1100 [#RDNA-OS-700]_,gfx1100
|
||||
,gfx1030 [#RDNA-OS-700]_ [#rd-v620]_,gfx1030 [#RDNA-OS-700]_ [#rd-v620]_,gfx1030
|
||||
,gfx942 [#mi325x-os]_ [#mi300x-os]_ [#mi300A-os]_,gfx942 [#mi325x-os]_ [#mi300x-os]_ [#mi300A-os]_,gfx942
|
||||
,gfx90a [#mi200x-os]_,gfx90a [#mi200x-os]_,gfx90a
|
||||
,gfx908 [#mi100-os]_,gfx908 [#mi100-os]_,gfx908
|
||||
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>` [#gpu-compatibility]_,gfx950,gfx950,
|
||||
,gfx1201,gfx1201,
|
||||
,gfx1200,gfx1200,
|
||||
,gfx1101,gfx1101,
|
||||
,gfx1100,gfx1100,gfx1100
|
||||
,gfx1030,gfx1030,gfx1030
|
||||
,gfx942,gfx942,gfx942
|
||||
,gfx90a,gfx90a,gfx90a
|
||||
,gfx908,gfx908,gfx908
|
||||
,,,
|
||||
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.19.1, 2.18.1, 2.17.1 [#tf-mi350]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350]_","2.18.1, 2.17.1, 2.16.2"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.6.0,0.4.35
|
||||
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.9, 2.8, 2.7","2.8, 2.7, 2.6","2.6, 2.5, 2.4, 2.3"
|
||||
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.20.0, 2.19.1, 2.18.1","2.20.0, 2.19.1, 2.18.1","2.18.1, 2.17.1, 2.16.2"
|
||||
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.7.1,0.7.1,0.4.35
|
||||
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,2.4.0
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,b6356,b5997
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.20.0
|
||||
:doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,N/A,b5997
|
||||
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.23.1,1.22.0,1.20.0
|
||||
,,,
|
||||
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
|
||||
`UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.3.0
|
||||
`UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.15.0
|
||||
,,,
|
||||
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix:,,
|
||||
Thrust,2.6.0,2.6.0,2.5.0
|
||||
CUB,2.6.0,2.6.0,2.5.0
|
||||
Thrust,2.8.5,2.8.5,2.5.0
|
||||
CUB,2.8.5,2.8.5,2.5.0
|
||||
,,,
|
||||
DRIVER & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.10.2, 30.10.1 [#driver_patch]_, |br| 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch]_, 30.10, |br| 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
|
||||
:doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.20.1, 30.20.0 [#mi325x_KVM]_, |br| 30.10.2, 30.10.1 [#driver_patch]_, |br| 30.10, 6.4.x","30.20.0 [#mi325x_KVM]_, 30.10.2, |br| 30.10.1 [#driver_patch]_, 30.10, 6.4.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
|
||||
,,,
|
||||
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
|
||||
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.13.0,2.13.0,2.12.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.0,3.5.0,3.4.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.3.0,3.3.0,3.2.0
|
||||
:doc:`rocAL <rocal:index>`,2.3.0,2.3.0,2.2.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.0.0,1.0.0,0.10.0
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.1.0,1.1.0,0.8.0
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.6.0,0.6.0,0.3.1
|
||||
:doc:`RPP <rpp:index>`,2.0.0,2.0.0,1.9.10
|
||||
:doc:`MIGraphX <amdmigraphx:index>`,2.14.0,2.14.0,2.12.0
|
||||
:doc:`MIOpen <miopen:index>`,3.5.1,3.5.1,3.4.0
|
||||
:doc:`MIVisionX <mivisionx:index>`,3.4.0,3.4.0,3.2.0
|
||||
:doc:`rocAL <rocal:index>`,2.4.0,2.4.0,2.2.0
|
||||
:doc:`rocDecode <rocdecode:index>`,1.4.0,1.4.0,0.10.0
|
||||
:doc:`rocJPEG <rocjpeg:index>`,1.2.0,1.2.0,0.8.0
|
||||
:doc:`rocPyDecode <rocpydecode:index>`,0.7.0,0.7.0,0.3.1
|
||||
:doc:`RPP <rpp:index>`,2.1.0,2.1.0,1.9.10
|
||||
,,,
|
||||
COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
|
||||
:doc:`RCCL <rccl:index>`,2.26.6,2.26.6,2.22.3
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,2.0.0
|
||||
:doc:`RCCL <rccl:index>`,2.27.7,2.27.7,2.22.3
|
||||
:doc:`rocSHMEM <rocshmem:index>`,3.1.0,3.0.0,2.0.0
|
||||
,,,
|
||||
MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
|
||||
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.0.2,3.0.0,2.4.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.0.0,1.0.0,0.12.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.20,1.0.20,1.0.18
|
||||
:doc:`hipfort <hipfort:index>`,0.7.0,0.7.0,0.6.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.0.0,3.0.0,2.12.0
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.0.0,3.0.0,2.4.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.0.1,4.0.1,3.2.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.4,0.2.4,0.2.3
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.0,4.0.0,3.2.2
|
||||
:doc:`rocBLAS <rocblas:index>`,5.0.2,5.0.0,4.4.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.34,1.0.34,1.0.32
|
||||
:doc:`rocRAND <rocrand:index>`,4.0.0,4.0.0,3.3.0
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.30.1,3.30.0,3.28.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.0.2,4.0.2,3.4.0
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,1.7.0
|
||||
:doc:`hipBLAS <hipblas:index>`,3.1.0,3.1.0,2.4.0
|
||||
:doc:`hipBLASLt <hipblaslt:index>`,1.1.0,1.1.0,0.12.0
|
||||
:doc:`hipFFT <hipfft:index>`,1.0.21,1.0.21,1.0.18
|
||||
:doc:`hipfort <hipfort:index>`,0.7.1,0.7.1,0.6.0
|
||||
:doc:`hipRAND <hiprand:index>`,3.1.0,3.1.0,2.12.0
|
||||
:doc:`hipSOLVER <hipsolver:index>`,3.1.0,3.1.0,2.4.0
|
||||
:doc:`hipSPARSE <hipsparse:index>`,4.1.0,4.1.0,3.2.0
|
||||
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.5,0.2.5,0.2.3
|
||||
:doc:`rocALUTION <rocalution:index>`,4.0.1,4.0.1,3.2.2
|
||||
:doc:`rocBLAS <rocblas:index>`,5.1.1,5.1.0,4.4.0
|
||||
:doc:`rocFFT <rocfft:index>`,1.0.35,1.0.35,1.0.32
|
||||
:doc:`rocRAND <rocrand:index>`,4.1.0,4.1.0,3.3.0
|
||||
:doc:`rocSOLVER <rocsolver:index>`,3.31.0,3.31.0,3.28.0
|
||||
:doc:`rocSPARSE <rocsparse:index>`,4.1.0,4.1.0,3.4.0
|
||||
:doc:`rocWMMA <rocwmma:index>`,2.1.0,2.0.0,1.7.0
|
||||
:doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.43.0
|
||||
,,,
|
||||
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
|
||||
:doc:`hipCUB <hipcub:index>`,4.0.0,4.0.0,3.4.0
|
||||
:doc:`hipCUB <hipcub:index>`,4.1.0,4.1.0,3.4.0
|
||||
:doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,1.5.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.0.1,4.0.0,3.4.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.0.0,4.0.0,3.3.0
|
||||
:doc:`rocPRIM <rocprim:index>`,4.1.0,4.1.0,3.4.0
|
||||
:doc:`rocThrust <rocthrust:index>`,4.1.0,4.1.0,3.3.0
|
||||
,,,
|
||||
SUPPORT LIBS,,,
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.0.51830,7.0.51830,6.4.43482
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.2,7.0.1/7.0.0,6.4.0
|
||||
`hipother <https://github.com/ROCm/hipother>`_,7.1.52802,7.1.25424,6.4.43482
|
||||
`rocm-core <https://github.com/ROCm/rocm-core>`_,7.1.1,7.1.0,6.4.0
|
||||
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
|
||||
,,,
|
||||
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.0.2,26.0.0,25.3.0
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.1.0,1.1.0,0.3.0
|
||||
:doc:`AMD SMI <amdsmi:index>`,26.2.0,26.1.0,25.3.0
|
||||
:doc:`ROCm Data Center Tool <rdc:index>`,1.2.0,1.2.0,0.3.0
|
||||
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
|
||||
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.5.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.1.0
|
||||
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.3.0,1.2.0,1.1.0
|
||||
,,,
|
||||
PERFORMANCE TOOLS,,,
|
||||
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,1.4.0
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.2.3,3.2.3,3.1.0
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.1.1,1.1.0,1.0.0
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70002,2.0.70000,2.0.60400
|
||||
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.3.1,3.3.0,3.1.0
|
||||
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.2.1,1.2.0,1.0.0
|
||||
:doc:`ROCProfiler <rocprofiler:index>`,2.0.70101,2.0.70100,2.0.60400
|
||||
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,0.6.0
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70002,4.1.70000,4.1.60400
|
||||
:doc:`ROCTracer <roctracer:index>`,4.1.70101,4.1.70100,4.1.60400
|
||||
,,,
|
||||
DEVELOPMENT TOOLS,,,
|
||||
:doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,19.0.0
|
||||
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.3,0.77.2
|
||||
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.4,0.77.2
|
||||
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,15.2.0
|
||||
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.4.0
|
||||
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.0.4
|
||||
@@ -143,86 +143,33 @@ compatibility and system requirements.
|
||||
COMPILERS,.. _compilers-support-compatibility-matrix:,,
|
||||
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A
|
||||
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.0.25385,20.0.0.25314,19.0.0.25133
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.0.25385,20.0.0.25314,19.0.0.25133
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25385,20.0.0.25314,19.0.0.25133
|
||||
`Flang <https://github.com/ROCm/flang>`_,20.0.0.25444,20.0.0.25425,19.0.0.25133
|
||||
:doc:`llvm-project <llvm-project:index>`,20.0.0.25444,20.0.0.25425,19.0.0.25133
|
||||
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25444,20.0.0.25425,19.0.0.25133
|
||||
,,,
|
||||
RUNTIMES,.. _runtime-support-compatibility-matrix:,,
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.0.51831,7.0.51830,6.4.43482
|
||||
:doc:`HIP <hip:index>`,7.0.51831,7.0.51830,6.4.43482
|
||||
:doc:`AMD CLR <hip:understand/amd_clr>`,7.1.52802,7.1.25424,6.4.43482
|
||||
:doc:`HIP <hip:index>`,7.1.52802,7.1.25424,6.4.43482
|
||||
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
|
||||
:doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.15.0
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#rhel-10-702] RHEL 10.0 and RHEL 9.6 are supported on all listed :ref:`supported_GPUs` except AMD Radeon PRO V620 GPU.
|
||||
.. [#rhel-94-702] RHEL 9.4 is supported on all AMD Instinct GPUs listed under :ref:`supported_GPUs`.
|
||||
.. [#rhel-700] RHEL 8.10 is supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
|
||||
.. [#ol-700-mi300x] **For ROCm 7.0.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
|
||||
.. [#ol-mi300x] **Prior to ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#db-mi300x] **For ROCm 7.0.2** - Debian 13 is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#sles-db-700] **For ROCm 7.0.x** - SLES 15 SP7 and Debian 12 are supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
|
||||
.. [#az-mi300x] Starting from ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710 GPUs.
|
||||
.. [#rl-700] Rocky Linux 9 is supported only on AMD Instinct MI300X and MI300A GPUs.
|
||||
.. [#single-node] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X GPUs for single-node functionality.
|
||||
.. [#mi350x-os] AMD Instinct MI355X (gfx950) and MI350X (gfx950) GPUs are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, Oracle Linux 10, and Oracle Linux 9.
|
||||
.. [#RDNA-OS-700] **For ROCm 7.0.x** - AMD Radeon AI PRO R9700 (gfx1201), AMD Radeon RX 9070 XT (gfx1201), AMD Radeon RX 9070 GRE (gfx1201), AMD Radeon RX 9070 (gfx1201), AMD Radeon RX 9060 XT (gfx1200), AMD Radeon RX 9060 (gfx1200), AMD Radeon RX 7800 XT (gfx1101), AMD Radeon RX 7700 XT (gfx1101), AMD Radeon PRO W7700 (gfx1101), and AMD Radeon PRO W6800 (gfx1030) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, and RHEL 9.6.
|
||||
.. [#rd-v710] **For ROCm 7.0.x** - AMD Radeon PRO V710 (gfx1101) GPUs are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, and Azure Linux 3.0.
|
||||
.. [#rd-v620] **For ROCm 7.0.x** - AMD Radeon PRO V620 (gfx1030) GPUs are supported only on Ubuntu 24.04.3 and Ubuntu 22.04.5.
|
||||
.. [#mi325x-os] **For ROCm 7.0.x** - AMD Instinct MI325X GPUs (gfx942) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
|
||||
.. [#mi300x-os] **For ROCm 7.0.x** - AMD Instinct MI300X GPUs (gfx942) are supported on all listed :ref:`supported_distributions`.
|
||||
.. [#mi300A-os] **For ROCm 7.0.x** - AMD Instinct MI300A GPUs (gfx942) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, Debian 12, and Rocky Linux 9.
|
||||
.. [#mi200x-os] **For ROCm 7.0.x** - AMD Instinct MI200 Series GPUs (gfx90a) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, and Debian 12.
|
||||
.. [#mi100-os] **For ROCm 7.0.x** - AMD Instinct MI100 GPUs (gfx908) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, and RHEL 8.10.
|
||||
.. [#tf-mi350] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
|
||||
.. [#dgl_compat] DGL is supported only on ROCm 6.4.0.
|
||||
.. [#llama-cpp_compat] llama.cpp is supported only on ROCm 7.0.0 and ROCm 6.4.x.
|
||||
.. [#os-compatibility] Some operating systems are supported on limited GPUs. For detailed information, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
|
||||
.. [#gpu-compatibility] Some GPUs have limited operating system support. For detailed information, see the latest :ref:`supported_GPUs`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-gpus>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-gpus>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-gpus>`__.
|
||||
.. [#dgl_compat] DGL is only supported on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
|
||||
.. [#llama-cpp_compat] llama.cpp is only supported on ROCm 7.0.0 and ROCm 6.4.x.
|
||||
.. [#mi325x_KVM] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
|
||||
.. [#driver_patch] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
|
||||
.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
|
||||
.. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
|
||||
|
||||
|
||||
.. _OS-kernel-versions:
|
||||
|
||||
Operating systems, kernel and Glibc versions
|
||||
*********************************************
|
||||
|
||||
Use this lookup table to confirm which operating system and kernel versions are supported with ROCm.
|
||||
|
||||
.. csv-table::
|
||||
:header: "OS", "Version", "Kernel", "Glibc"
|
||||
:widths: 40, 20, 30, 20
|
||||
:stub-columns: 1
|
||||
|
||||
`Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.3, "6.8 [GA], 6.14 [HWE]", 2.39
|
||||
,,
|
||||
`Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 24.04.2, "6.8 [GA], 6.11 [HWE]", 2.39
|
||||
,,
|
||||
`Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 [GA], 6.8 [HWE]", 2.35
|
||||
,,
|
||||
`Red Hat Enterprise Linux (RHEL 10) <https://access.redhat.com/articles/3078#RHEL9>`_, 10.0, 6.12.0-55, 2.39
|
||||
,,
|
||||
`Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.6, 5.14.0-570, 2.34
|
||||
,9.5, 5.14+, 2.34
|
||||
,9.4, 5.14.0-427, 2.34
|
||||
,,
|
||||
`Red Hat Enterprise Linux (RHEL 8) <https://access.redhat.com/articles/3078#RHEL8>`_, 8.10, 4.18.0-553, 2.28
|
||||
,,
|
||||
`SUSE Linux Enterprise Server (SLES) <https://www.suse.com/support/kb/doc/?id=000019587#SLE15SP4>`_, 15 SP7, 6.4.0-150700.51, 2.38
|
||||
,15 SP6, "6.5.0+, 6.4.0", 2.38
|
||||
,15 SP5, 5.14.21, 2.31
|
||||
,,
|
||||
`Rocky Linux <https://wiki.rockylinux.org/rocky/version/>`_, 9, 5.14.0-570, 2.34
|
||||
,,
|
||||
`Oracle Linux <https://blogs.oracle.com/scoter/post/oracle-linux-and-unbreakable-enterprise-kernel-uek-releases>`_, 10, 6.12.0 (UEK), 2.39
|
||||
,9, 6.12.0 (UEK), 2.34
|
||||
,8, 5.15.0 (UEK), 2.28
|
||||
,,
|
||||
`Debian <https://www.debian.org/download>`_,13, 6.12, 2.35
|
||||
,12, 6.1.0, 2.36
|
||||
,,
|
||||
`Azure Linux <https://techcommunity.microsoft.com/blog/linuxandopensourceblog/azure-linux-3-0-now-in-preview-on-azure-kubernetes-service-v1-31/4287229>`_,3.0, 6.6.92, 2.38
|
||||
,,
|
||||
For detailed information on the operating systems supported on ROCm 7.1.1 and the associated kernel and glibc versions, see the latest :ref:`supported_distributions`. For version-specific information, see `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`__ and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
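To compare a host against the table above, the following minimal Python sketch can be used; it is an illustration only, relies solely on the standard library, and makes no ROCm-specific assumptions.

.. code-block:: python

   # Print the kernel, glibc, and distribution versions of this host so they
   # can be checked against the supported operating systems table above.
   import platform

   print("Kernel:", platform.release())        # e.g. 6.8.0-xx-generic
   print("Glibc :", platform.libc_ver()[1])    # e.g. 2.39 (glibc-based systems)

   # /etc/os-release carries the distribution name and version on most
   # modern Linux distributions.
   with open("/etc/os-release") as f:
       info = dict(line.rstrip().split("=", 1) for line in f if "=" in line)
   print("OS    :", info.get("PRETTY_NAME", "unknown").strip('"'))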
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -254,46 +201,17 @@ Expand for full historical view of:
|
||||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [#rhel-10-702-past-60] RHEL 10.0 and RHEL 9.6 are supported on all listed :ref:`supported_GPUs` except AMD Radeon PRO V620 GPU.
|
||||
.. [#rhel-94-702-past-60] RHEL 9.4 is supported on all AMD Instinct GPUs listed under :ref:`supported_GPUs`.
|
||||
.. [#rhel-700-past-60] **For ROCm 7.0.x** - RHEL 8.10 is supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
|
||||
.. [#ol-700-mi300x-past-60] **For ROCm 7.0.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
|
||||
.. [#mi300x-past-60] **Prior to ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#db-mi300x-past-60] **For ROCm 7.0.2** - Debian 13 is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#sles-db-700-past-60] **For ROCm 7.0.x** - SLES 15 SP7 and Debian 12 are supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
|
||||
.. [#single-node-past-60] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X GPUs for single-node functionality.
|
||||
.. [#az-mi300x-past-60] Starting from ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710 GPUs.
|
||||
.. [#az-mi300x-630-past-60] **Prior to ROCm 6.4.0** - Azure Linux 3.0 is supported only on AMD Instinct MI300X GPUs.
|
||||
.. [#rl-700-past-60] Rocky Linux 9 is supported only on AMD Instinct MI300X and MI300A GPUs.
|
||||
.. [#mi350x-os-past-60] AMD Instinct MI355X (gfx950) and MI350X (gfx950) GPUs are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.4, and Oracle Linux 9.
|
||||
.. [#RDNA-OS-700-past-60] **For ROCm 7.0.x** - AMD Radeon AI PRO R9700 (gfx1201), AMD Radeon RX 9070 XT (gfx1201), AMD Radeon RX 9070 GRE (gfx1201), AMD Radeon RX 9070 (gfx1201), AMD Radeon RX 9060 XT (gfx1200), AMD Radeon RX 9060 (gfx1200), AMD Radeon RX 7800 XT (gfx1101), AMD Radeon RX 7700 XT (gfx1101), AMD Radeon PRO W7700 (gfx1101), and AMD Radeon PRO W6800 (gfx1030) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, Oracle Linux 10, and Oracle Linux 9.
|
||||
.. [#RDNA-OS-past-60] **Prior to ROCm 7.0.0** - Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
|
||||
.. [#rd-v710-past-60] **For ROCm 7.0.x** - AMD Radeon PRO V710 (gfx1101) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, and Azure Linux 3.0.
|
||||
.. [#rd-v620-past-60] **For ROCm 7.0.x** - AMD Radeon PRO V620 (gfx1030) is supported only on Ubuntu 24.04.3 and Ubuntu 22.04.5.
|
||||
.. [#mi325x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI325X GPU (gfx942) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
|
||||
.. [#mi300x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI300X GPU (gfx942) is supported on all listed :ref:`supported_distributions`.
|
||||
.. [#mi300A-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI300A GPU (gfx942) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, Debian 12, and Rocky Linux 9.
|
||||
.. [#mi200x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI200 Series GPUs (gfx90a) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, and Debian 12.
|
||||
.. [#mi100-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI100 GPU (gfx908) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, and RHEL 8.10.
|
||||
.. [#7700XT-OS-past-60] **Prior to ROCm 7.0.0** - Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
|
||||
.. [#mi300_624-past-60] **For ROCm 6.2.4** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_622-past-60] **For ROCm 6.2.2** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_621-past-60] **For ROCm 6.2.1** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_620-past-60] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
|
||||
.. [#mi300_612-past-60] **For ROCm 6.1.2** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4 and Oracle Linux.
|
||||
.. [#mi300_611-past-60] **For ROCm 6.1.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4 and Oracle Linux.
|
||||
.. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4.
|
||||
.. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.3.
|
||||
.. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.3.
|
||||
.. [#os-compatibility-past-60] Some operating systems are supported on limited GPUs. For detailed information, see the latest :ref:`supported_distributions`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-operating-systems>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-operating-systems>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-operating-systems>`__.
|
||||
.. [#gpu-compatibility-past-60] Some GPUs have limited operating system support. For detailed information, see the latest :ref:`supported_GPUs`. For version specific information, see `ROCm 7.1.1 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.1/reference/system-requirements.html#supported-gpus>`__, `ROCm 7.1.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.1.0/reference/system-requirements.html#supported-gpus>`__, and `ROCm 6.4.0 <https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.0/reference/system-requirements.html#supported-gpus>`__.
|
||||
.. [#tf-mi350-past-60] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
|
||||
.. [#verl_compat-past-60] verl is supported only on ROCm 6.2.0.
|
||||
.. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is supported only on ROCm 6.3.0.
|
||||
.. [#dgl_compat-past-60] DGL is supported only on ROCm 6.4.0.
|
||||
.. [#megablocks_compat-past-60] Megablocks is supported only on ROCm 6.3.0.
|
||||
.. [#taichi_compat-past-60] Taichi is supported only on ROCm 6.3.2.
|
||||
.. [#ray_compat-past-60] Ray is supported only on ROCm 6.4.1.
|
||||
.. [#llama-cpp_compat-past-60] llama.cpp is supported only on ROCm 7.0.0 and 6.4.x.
|
||||
.. [#flashinfer_compat-past-60] FlashInfer is supported only on ROCm 6.4.1.
|
||||
.. [#verl_compat-past-60] verl is only supported on ROCm 7.0.0 and 6.2.0.
|
||||
.. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is only supported on ROCm 6.3.0.
|
||||
.. [#dgl_compat-past-60] DGL is only supported on ROCm 7.0.0, ROCm 6.4.3 and ROCm 6.4.0.
|
||||
.. [#megablocks_compat-past-60] Megablocks is only supported on ROCm 6.3.0.
|
||||
.. [#ray_compat-past-60] Ray is only supported on ROCm 7.0.0 and 6.4.1.
|
||||
.. [#llama-cpp_compat-past-60] llama.cpp is only supported on ROCm 7.0.0 and 6.4.x.
|
||||
.. [#flashinfer_compat-past-60] FlashInfer is only supported on ROCm 6.4.1.
|
||||
.. [#mi325x_KVM-past-60] For AMD Instinct MI325X KVM SR-IOV users, do not use AMD GPU Driver (amdgpu) 30.20.0.
|
||||
.. [#driver_patch-past-60] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
|
||||
.. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
|
||||
.. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: Deep Graph Library (DGL) compatibility
|
||||
:keywords: GPU, DGL compatibility
|
||||
:keywords: GPU, CPU, deep graph library, DGL, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,215 +10,274 @@
|
||||
DGL compatibility
|
||||
********************************************************************************
|
||||
|
||||
Deep Graph Library `(DGL) <https://www.dgl.ai/>`_ is an easy-to-use, high-performance and scalable
|
||||
Deep Graph Library (`DGL <https://www.dgl.ai/>`__) is an easy-to-use, high-performance, and scalable
|
||||
Python package for deep learning on graphs. DGL is framework agnostic, meaning
|
||||
if a deep graph model is a component in an end-to-end application, the rest of
|
||||
that if a deep graph model is a component in an end-to-end application, the rest of
|
||||
the logic is implemented using PyTorch.
|
||||
|
||||
* ROCm support for DGL is hosted in the `https://github.com/ROCm/dgl <https://github.com/ROCm/dgl>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker images <dgl-docker-compat>` with DGL, PyTorch, and ROCm preinstalled.
|
||||
* See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>`
|
||||
to install and get started.
|
||||
DGL provides a high-performance graph object that can reside on either CPUs or GPUs.
|
||||
It bundles structural data features for better control and provides a variety of functions
|
||||
for computing with graph objects, including efficient and customizable message passing
|
||||
primitives for Graph Neural Networks.
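The following minimal sketch is an illustration only; it assumes a working DGL and PyTorch (ROCm build) installation, where the GPU is addressed through the ``cuda`` device alias that PyTorch uses on ROCm.

.. code-block:: python

   # Build a small graph, attach node features, and run one round of message
   # passing with DGL's built-in message functions.
   import dgl
   import dgl.function as fn
   import torch

   # A 4-node directed graph given as (source, destination) edge tensors.
   src = torch.tensor([0, 1, 2, 3])
   dst = torch.tensor([1, 2, 3, 0])
   g = dgl.graph((src, dst))

   # Attach a feature vector to every node.
   g.ndata["h"] = torch.randn(g.num_nodes(), 8)

   # Move the graph (structure and features) to the GPU if one is visible.
   device = "cuda" if torch.cuda.is_available() else "cpu"
   g = g.to(device)

   # One message-passing step: copy each source node's feature along its
   # out-edges and sum the incoming messages at every destination node.
   g.update_all(fn.copy_u("h", "m"), fn.sum("m", "h_agg"))
   print(g.ndata["h_agg"].shape)  # torch.Size([4, 8])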
|
||||
|
||||
|
||||
Supported devices
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- **Officially Supported**: TF32 with AMD Instinct MI300X (through hipblaslt)
|
||||
- **Partially Supported**: TF32 with AMD Instinct MI250X
|
||||
- The ROCm-supported version of DGL is maintained in the official `https://github.com/ROCm/dgl
|
||||
<https://github.com/ROCm/dgl>`__ repository, which differs from the
|
||||
`https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`__ upstream repository.
|
||||
|
||||
- To get started and install DGL on ROCm, use the prebuilt :ref:`Docker images <dgl-docker-compat>`,
|
||||
which include ROCm, DGL, and all required dependencies.
|
||||
|
||||
.. _dgl-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
DGL can be used for graph learning and for building popular graph models such as
GAT, GCN, and GraphSAGE; a minimal model sketch follows the list below. These models support a variety of use cases, such as:
|
||||
|
||||
- Recommender systems
|
||||
- Network Optimization and Analysis
|
||||
- 1D (Temporal) and 2D (Image) Classification
|
||||
- Drug Discovery
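The sketch below is a minimal, illustrative two-layer GCN built with DGL's ``GraphConv`` module; the layer sizes and the random toy graph are assumptions made only for this example.

.. code-block:: python

   # Minimal two-layer GCN sketch (illustrative sizes, not a tuned model).
   import dgl
   import torch
   import torch.nn as nn
   import torch.nn.functional as F
   from dgl.nn import GraphConv

   class GCN(nn.Module):
       def __init__(self, in_feats, hidden_feats, num_classes):
           super().__init__()
           self.conv1 = GraphConv(in_feats, hidden_feats)
           self.conv2 = GraphConv(hidden_feats, num_classes)

       def forward(self, g, features):
           h = F.relu(self.conv1(g, features))
           return self.conv2(g, h)

   # Toy usage: self-loops avoid zero-in-degree nodes, which GraphConv
   # rejects by default. On ROCm the GPU is reached via the "cuda" alias.
   device = "cuda" if torch.cuda.is_available() else "cpu"
   g = dgl.add_self_loop(dgl.rand_graph(100, 500)).to(device)
   feats = torch.randn(100, 16, device=device)
   model = GCN(16, 32, 4).to(device)
   logits = model(g, feats)  # shape: (100, 4)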
|
||||
|
||||
Multiple DGL use cases have been tested and verified. A recommended example is a drug discovery pipeline built around the ``SE3Transformer``.
Refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_,
where you can search for DGL examples and best practices to optimize your training workflows on AMD GPUs.
|
||||
|
||||
Coverage includes:
|
||||
|
||||
- Single-GPU training/inference
|
||||
- Multi-GPU training
|
||||
- See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://www.dgl.ai/pages/start.html>`__
|
||||
for additional context.
|
||||
|
||||
.. _dgl-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl>`_
|
||||
with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_.
|
||||
AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest available DGL version from the official Docker Hub.
|
||||
Click the |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table:: DGL Docker image components
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- DGL
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu24.04_py3.12_pytorch_2.8.0/images/sha256-943698ddf54c22a7bcad2e5b4ff467752e29e4ba6d0c926789ae7b242cbd92dd"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.8.0 <https://github.com/pytorch/pytorch/releases/tag/v2.8.0>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu24.04_py3.12_pytorch_2.6.0/images/sha256-b2ec286a035eb7d0a6aab069561914d21a3cac462281e9c024501ba5ccedfbf7"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm7.0.0_ubuntu22.04_py3.10_pytorch_2.7.1/images/sha256-d27aee16df922ccf0bcd9107bfcb6d20d34235445d456c637e33ca6f19d11a51"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.7.1 <https://github.com/pytorch/pytorch/releases/tag/v2.7.1>`__
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i></a>
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4.0.amd0_rocm6.4.3_ubuntu24.04_py3.12_pytorch_2.6.0/images/sha256-f3ba6a3c9ec9f6c1cde28449dc9780e0c4c16c4140f4b23f158565fbfd422d6b"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
|
||||
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
|
||||
- `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.6.0 <https://github.com/pytorch/pytorch/releases/tag/v2.6.0>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.4.1 <https://github.com/pytorch/pytorch/releases/tag/v2.4.1>`__
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.4.1 <https://github.com/pytorch/pytorch/releases/tag/v2.4.1>`__
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
|
||||
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
|
||||
- MI300X, MI250X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i> rocm/dgl</a>
|
||||
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
|
||||
- `2.3.0 <https://github.com/pytorch/pytorch/releases/tag/v2.3.0>`__
|
||||
- 22.04
|
||||
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
|
||||
- MI300X, MI250X
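After pulling and starting one of the images above, the following Python check (an illustration, not an official validation script) confirms that the expected DGL and PyTorch builds are present and that the ROCm GPU is visible; PyTorch on ROCm reports the device through its CUDA-named API.

.. code-block:: python

   # Run inside a rocm/dgl container to confirm the environment.
   import dgl
   import torch

   print("DGL:", dgl.__version__)
   print("PyTorch:", torch.__version__)
   print("HIP runtime:", torch.version.hip)      # None on non-ROCm builds
   print("GPU visible:", torch.cuda.is_available())
   if torch.cuda.is_available():
       print("Device:", torch.cuda.get_device_name(0))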
|
||||
|
||||
|
||||
.. _dgl-key-rocm-libraries:
|
||||
|
||||
Key ROCm libraries for DGL
|
||||
================================================================================
|
||||
|
||||
DGL on ROCm depends on specific libraries that affect its features and performance.
|
||||
Using the DGL Docker container or building it with the provided docker file or a ROCm base image is recommended.
|
||||
Using the DGL Docker container or building it with the provided Docker file or a ROCm base image is recommended.
|
||||
If you prefer to build it yourself, ensure the following dependencies are installed:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - ROCm library
|
||||
- Version
|
||||
- ROCm 7.0.0 Version
|
||||
- ROCm 6.4.x Version
|
||||
- Purpose
|
||||
* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
|
||||
- :version-ref:`"Composable Kernel" rocm_version`
|
||||
- 1.1.0
|
||||
- 1.1.0
|
||||
- Enables faster execution of core operations like matrix multiplication
|
||||
(GEMM), convolutions and transformations.
|
||||
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
|
||||
- :version-ref:`hipBLAS rocm_version`
|
||||
- 3.0.0
|
||||
- 2.4.0
|
||||
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
|
||||
matrix and vector operations.
|
||||
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
|
||||
- :version-ref:`hipBLASLt rocm_version`
|
||||
- 1.0.0
|
||||
- 0.12.0
|
||||
- hipBLASLt is an extension of the hipBLAS library, providing additional
|
||||
features like epilogues fused into the matrix multiplication kernel or
|
||||
use of integer tensor cores.
|
||||
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
|
||||
- :version-ref:`hipCUB rocm_version`
|
||||
- 4.0.0
|
||||
- 3.4.0
|
||||
- Provides a C++ template library for parallel algorithms for reduction,
|
||||
scan, sort and select.
|
||||
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
|
||||
- :version-ref:`hipFFT rocm_version`
|
||||
- 1.0.20
|
||||
- 1.0.18
|
||||
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
|
||||
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
|
||||
- :version-ref:`hipRAND rocm_version`
|
||||
- 3.0.0
|
||||
- 2.12.0
|
||||
- Provides fast random number generation for GPUs.
|
||||
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
|
||||
- :version-ref:`hipSOLVER rocm_version`
|
||||
- 3.0.0
|
||||
- 2.4.0
|
||||
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
|
||||
singular value decompositions (SVD).
|
||||
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
|
||||
- :version-ref:`hipSPARSE rocm_version`
|
||||
- 4.0.1
|
||||
- 3.2.0
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
|
||||
- :version-ref:`hipSPARSELt rocm_version`
|
||||
- 0.2.4
|
||||
- 0.2.3
|
||||
- Accelerates operations on sparse matrices, such as sparse matrix-vector
|
||||
or matrix-matrix products.
|
||||
* - `hipTensor <https://github.com/ROCm/hipTensor>`_
|
||||
- :version-ref:`hipTensor rocm_version`
|
||||
- 2.0.0
|
||||
- 1.5.0
|
||||
- Optimizes for high-performance tensor operations, such as contractions.
|
||||
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
|
||||
- :version-ref:`MIOpen rocm_version`
|
||||
- 3.5.0
|
||||
- 3.4.0
|
||||
- Optimizes deep learning primitives such as convolutions, pooling,
|
||||
normalization, and activation functions.
|
||||
* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
|
||||
- :version-ref:`MIGraphX rocm_version`
|
||||
- 2.13.0
|
||||
- 2.12.0
|
||||
- Adds graph-level optimizations, ONNX model and mixed precision support,
  and enables Ahead-of-Time (AOT) compilation.
|
||||
* - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
|
||||
- :version-ref:`MIVisionX rocm_version`
|
||||
- 3.3.0
|
||||
- 3.2.0
|
||||
- Optimizes acceleration for computer vision and AI workloads like
|
||||
preprocessing, augmentation, and inferencing.
|
||||
* - `rocAL <https://github.com/ROCm/rocAL>`_
|
||||
- :version-ref:`rocAL rocm_version`
|
||||
- 3.3.0
|
||||
- 2.2.0
|
||||
- Accelerates the data pipeline by offloading intensive preprocessing and
|
||||
augmentation tasks. rocAL is part of MIVisionX.
|
||||
* - `RCCL <https://github.com/ROCm/rccl>`_
|
||||
- :version-ref:`RCCL rocm_version`
|
||||
- 2.26.6
|
||||
- 2.22.3
|
||||
- Optimizes for multi-GPU communication for operations like AllReduce and
|
||||
Broadcast.
|
||||
* - `rocDecode <https://github.com/ROCm/rocDecode>`_
|
||||
- :version-ref:`rocDecode rocm_version`
|
||||
- 1.0.0
|
||||
- 0.10.0
|
||||
- Provides hardware-accelerated data decoding capabilities, particularly
|
||||
for image, video, and other dataset formats.
|
||||
* - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
|
||||
- :version-ref:`rocJPEG rocm_version`
|
||||
- 1.1.0
|
||||
- 0.8.0
|
||||
- Provides hardware-accelerated JPEG image decoding and encoding.
|
||||
* - `RPP <https://github.com/ROCm/RPP>`_
|
||||
- :version-ref:`RPP rocm_version`
|
||||
- 2.0.0
|
||||
- 1.9.10
|
||||
- Speeds up data augmentation, transformation, and other preprocessing steps.
|
||||
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- 4.0.0
|
||||
- 3.3.0
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
|
||||
- :version-ref:`rocWMMA rocm_version`
|
||||
- 2.0.0
|
||||
- 1.7.0
|
||||
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
|
||||
multiplication (GEMM) and accumulation operations with mixed precision
|
||||
support.
|
||||
|
||||
.. _dgl-supported-features-latest:
|
||||
|
||||
Supported features
|
||||
Supported features with ROCm 7.0.0
|
||||
================================================================================
|
||||
|
||||
Many functions and methods available in DGL Upstream are also supported in DGL ROCm.
|
||||
Many functions and methods available upstream are also supported in DGL on ROCm.
|
||||
Instead of listing them all, support is grouped into the following categories to provide a general overview.
|
||||
|
||||
* DGL Base
|
||||
* DGL Backend
|
||||
* DGL Data
|
||||
* DGL Dataloading
|
||||
* DGL DGLGraph
|
||||
* DGL Graph
|
||||
* DGL Function
|
||||
* DGL Ops
|
||||
* DGL Sampling
|
||||
@@ -230,26 +289,76 @@ Instead of listing them all, support is grouped into the following categories to
|
||||
* DGL NN
|
||||
* DGL Optim
|
||||
* DGL Sparse
|
||||
* GraphBolt
|
||||
|
||||
.. _dgl-unsupported-features-latest:
|
||||
|
||||
Unsupported features
|
||||
Unsupported features with ROCm 7.0.0
|
||||
================================================================================
|
||||
|
||||
* Graphbolt
|
||||
* Partial TF32 Support (MI250x only)
|
||||
* Kineto/ ROCTracer integration
|
||||
* TF32 Support (only supported for PyTorch 2.7 and above)
|
||||
* Kineto/ROCTracer integration
|
||||
|
||||
.. _dgl-unsupported-functions:
|
||||
|
||||
Unsupported functions
|
||||
Unsupported functions with ROCm 7.0.0
|
||||
================================================================================
|
||||
|
||||
* ``more_nnz``
|
||||
* ``bfs``
|
||||
* ``format``
|
||||
* ``multiprocess_sparse_adam_state_dict``
|
||||
* ``record_stream_ndarray``
|
||||
* ``half_spmm``
|
||||
* ``segment_mm``
|
||||
* ``gather_mm_idx_b``
|
||||
* ``pgexplainer``
|
||||
* ``sample_labors_prob``
|
||||
* ``sample_labors_noprob``
|
||||
* ``sparse_admin``
|
||||
|
||||
.. _dgl-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
DGL can be used for graph learning and for building popular graph models like
GAT, GCN, and GraphSAGE. These models support a variety of use cases (a minimal code sketch follows the list below):
|
||||
|
||||
- Recommender systems
|
||||
- Network Optimization and Analysis
|
||||
- 1D (Temporal) and 2D (Image) Classification
|
||||
- Drug Discovery
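
As a minimal sketch, the following shows a single GCN-style graph-convolution step with DGL on an AMD GPU, assuming a ROCm build of PyTorch and DGL (for example, from the ``rocm/dgl`` container); the graph, feature sizes, and device fallback are illustrative only:

.. code-block:: python

   import torch
   import dgl
   from dgl.nn import GraphConv

   # Build a tiny 4-node ring graph from source/destination edge lists.
   src = torch.tensor([0, 1, 2, 3])
   dst = torch.tensor([1, 2, 3, 0])
   g = dgl.add_self_loop(dgl.graph((src, dst), num_nodes=4))

   # Random node features: 4 nodes, 8 input features each.
   feat = torch.randn(4, 8)

   # Move the graph and features to the AMD GPU (exposed as "cuda" through HIP).
   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
   g, feat = g.to(device), feat.to(device)

   # One GraphConv layer (message passing) producing 16-dimensional outputs.
   conv = GraphConv(8, 16).to(device)
   out = conv(g, feat)
   print(out.shape)  # torch.Size([4, 16])
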
For use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for DGL examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
* Although multiple use cases of DGL have been tested and verified, a few have been
|
||||
outlined in the `DGL in the Real World: Running GNNs on Real Use Cases
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/dgl_blog2/README.html>`__ blog
|
||||
post, which walks through four real-world graph neural network (GNN) workloads
|
||||
implemented with the Deep Graph Library on ROCm. It covers tasks ranging from
|
||||
heterogeneous e-commerce graphs and multiplex networks (GATNE) to molecular graph
|
||||
regression (GNN-FiLM) and EEG-based neurological diagnosis (EEG-GCNN). For each use
|
||||
case, the authors detail the dataset and task, how DGL is used, and their experience
|
||||
porting to ROCm. It is shown that DGL codebases often run without modification, with
|
||||
seamless integration of graph operations, message passing, sampling, and convolution.
|
||||
|
||||
* The `Graph Neural Networks (GNNs) at Scale: DGL with ROCm on AMD Hardware
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/why-graph-neural/README.html>`__
|
||||
blog post introduces the Deep Graph Library (DGL) and its enablement on the AMD ROCm platform,
|
||||
bringing high-performance graph neural network (GNN) training to AMD GPUs. DGL bridges
|
||||
the gap between dense tensor frameworks and the irregular nature of graph data through a
|
||||
graph-first, message-passing abstraction. Its design ensures scalability, flexibility, and
|
||||
interoperability across frameworks like PyTorch and TensorFlow. AMD’s ROCm integration
|
||||
enables DGL to run efficiently on HIP-based GPUs, supported by prebuilt Docker containers
|
||||
and open-source repositories. This marks a major step in AMD's mission to advance open,
|
||||
scalable AI ecosystems beyond traditional architectures.
|
||||
|
||||
You can pre-process datasets and begin training on AMD GPUs through:
|
||||
|
||||
* Single-GPU training/inference
|
||||
* Multi-GPU training
|
||||
|
||||
|
||||
Previous versions
|
||||
===============================================================================
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/dgl-history` to find documentation for previous releases
|
||||
of the ``ROCm/dgl`` Docker image.
|
||||
@@ -1,8 +1,8 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: FlashInfer deep learning framework compatibility
|
||||
:keywords: GPU, LLM, FlashInfer, compatibility
|
||||
:description: FlashInfer compatibility
|
||||
:keywords: GPU, LLM, FlashInfer, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -11,7 +11,7 @@ FlashInfer compatibility
|
||||
********************************************************************************
|
||||
|
||||
`FlashInfer <https://docs.flashinfer.ai/index.html>`__ is a library and kernel generator
|
||||
for Large Language Models (LLMs) that provides high-performance implementation of graphics
|
||||
for Large Language Models (LLMs) that provides a high-performance implementation of graphics
processing unit (GPU) kernels. FlashInfer focuses on LLM serving and inference, as well
|
||||
as advanced performance across diverse scenarios.
|
||||
|
||||
@@ -25,63 +25,35 @@ offers high-performance LLM-specific operators, with easy integration through Py
|
||||
For the latest feature compatibility matrix, refer to the ``README`` of the
|
||||
`https://github.com/ROCm/flashinfer <https://github.com/ROCm/flashinfer>`__ repository.
|
||||
|
||||
Support for the ROCm port of FlashInfer is available as follows:
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- ROCm support for FlashInfer is hosted in the `https://github.com/ROCm/flashinfer
|
||||
<https://github.com/ROCm/flashinfer>`__ repository. This location differs from the
|
||||
`https://github.com/flashinfer-ai/flashinfer <https://github.com/flashinfer-ai/flashinfer>`_
|
||||
- The ROCm-supported version of FlashInfer is maintained in the official `https://github.com/ROCm/flashinfer
|
||||
<https://github.com/ROCm/flashinfer>`__ repository, which differs from the
|
||||
`https://github.com/flashinfer-ai/flashinfer <https://github.com/flashinfer-ai/flashinfer>`__
|
||||
upstream repository.
|
||||
|
||||
- To install FlashInfer, use the prebuilt :ref:`Docker image <flashinfer-docker-compat>`,
|
||||
which includes ROCm, FlashInfer, and all required dependencies.
|
||||
- To get started and install FlashInfer on ROCm, use the prebuilt :ref:`Docker images <flashinfer-docker-compat>`,
|
||||
which include ROCm, FlashInfer, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm FlashInfer installation guide <rocm-install-on-linux:install/3rd-party/flashinfer-install>`
|
||||
to install and get started.
|
||||
for installation and setup instructions.
|
||||
|
||||
- See the `Installation guide <https://docs.flashinfer.ai/installation.html>`__
|
||||
in the upstream FlashInfer documentation.
|
||||
|
||||
.. note::
|
||||
|
||||
Flashinfer is supported on ROCm 6.4.1.
|
||||
|
||||
Supported devices
|
||||
================================================================================
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X
|
||||
|
||||
|
||||
.. _flashinfer-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
This release of FlashInfer on ROCm provides the decode functionality for LLM inferencing.
|
||||
In the decode phase, tokens are generated sequentially, with the model predicting each new
|
||||
token based on the previously generated tokens and the input context.
|
||||
|
||||
FlashInfer on ROCm brings over upstream features such as load balancing, sparse and dense
|
||||
attention optimizations, and batching support, enabling efficient execution on AMD Instinct™ MI300X GPUs.
|
||||
|
||||
Because large LLMs often require substantial KV caches or long context windows, FlashInfer on ROCm
|
||||
also implements cascade attention from upstream to reduce memory usage.
|
||||
|
||||
For currently supported use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for examples and best practices to optimize your workloads on AMD GPUs.
|
||||
- You can also consult the upstream `Installation guide <https://docs.flashinfer.ai/installation.html>`__
|
||||
for additional context.
|
||||
|
||||
.. _flashinfer-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `ROCm FlashInfer images <https://hub.docker.com/r/rocm/flashinfer/tags>`__
|
||||
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the FlashInfer version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
|
||||
AMD validates and publishes `FlashInfer images <https://hub.docker.com/r/rocm/flashinfer/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated
|
||||
inventories represent the latest available FlashInfer version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
@@ -94,6 +66,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -103,5 +76,23 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `2.7.1 <https://github.com/ROCm/pytorch/releases/tag/v2.7.1>`__
|
||||
- 24.04
|
||||
- `3.12 <https://www.python.org/downloads/release/python-3129/>`__
|
||||
- MI300X
|
||||
|
||||
.. _flashinfer-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
The release of FlashInfer on ROCm provides the decode functionality for LLM inferencing.
|
||||
In the decode phase, tokens are generated sequentially, with the model predicting each new
|
||||
token based on the previously generated tokens and the input context.
|
||||
FlashInfer on ROCm brings over upstream features such as load balancing, sparse and dense
|
||||
attention optimizations, and batching support, enabling efficient execution on AMD Instinct™ MI300X GPUs.
|
||||
|
||||
Because large LLMs often require substantial KV caches or long context windows, FlashInfer on ROCm
|
||||
also implements cascade attention from upstream to reduce memory usage.
|
||||
|
||||
For currently supported use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: JAX compatibility
|
||||
:keywords: GPU, JAX compatibility
|
||||
:keywords: GPU, JAX, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,42 +10,58 @@
|
||||
JAX compatibility
|
||||
*******************************************************************************
|
||||
|
||||
JAX provides a NumPy-like API, which combines automatic differentiation and the
|
||||
Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
|
||||
learning at scale.
|
||||
`JAX <https://docs.jax.dev/en/latest/notebooks/thinking_in_jax.html>`__ is a library
|
||||
for array-oriented numerical computation (similar to NumPy), with automatic differentiation
|
||||
and just-in-time (JIT) compilation to enable high-performance machine learning research.
|
||||
|
||||
JAX uses composable transformations of Python and NumPy through just-in-time
|
||||
(JIT) compilation, automatic vectorization, and parallelization. To learn about
|
||||
JAX, including profiling and optimizations, see the official `JAX documentation
|
||||
<https://jax.readthedocs.io/en/latest/notebooks/quickstart.html>`_.
|
||||
JAX provides an API that combines automatic differentiation and the
|
||||
Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
|
||||
learning at scale. JAX uses composable transformations of Python and NumPy through
|
||||
JIT compilation, automatic vectorization, and parallelization.
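
As a minimal sketch of these transformations (array sizes are arbitrary, and the same code runs unchanged on an AMD GPU when the ROCm backend is installed):

.. code-block:: python

   import jax
   import jax.numpy as jnp

   def loss(w, x, y):
       # Simple least-squares loss for a linear model.
       return jnp.mean((x @ w - y) ** 2)

   # grad differentiates the loss, jit compiles the step with XLA,
   # and vmap vectorizes a per-example function over the batch dimension.
   grad_fn = jax.jit(jax.grad(loss))
   per_example = jax.vmap(lambda x, y, w: (x @ w - y) ** 2, in_axes=(0, 0, None))

   key = jax.random.PRNGKey(0)
   x = jax.random.normal(key, (128, 16))
   w = jnp.zeros(16)
   y = jnp.ones(128)

   g = grad_fn(w, x, y)         # compiled gradient of the mean loss, shape (16,)
   errs = per_example(x, y, w)  # vectorized per-example squared errors, shape (128,)
   print(g.shape, errs.shape)
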
ROCm support for JAX is upstreamed, and users can build the official source code
|
||||
with ROCm support:
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- ROCm JAX release:
|
||||
- The ROCm-supported version of JAX is maintained in the official `https://github.com/ROCm/rocm-jax
|
||||
<https://github.com/ROCm/rocm-jax>`__ repository, which differs from the
|
||||
`https://github.com/jax-ml/jax <https://github.com/jax-ml/jax>`__ upstream repository.
|
||||
|
||||
- Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>`
|
||||
with ROCm and JAX preinstalled.
|
||||
- To get started and install JAX on ROCm, use the prebuilt :ref:`Docker images <jax-docker-compat>`,
|
||||
which include ROCm, JAX, and all required dependencies.
|
||||
|
||||
- ROCm JAX repository: `ROCm/rocm-jax <https://github.com/ROCm/rocm-jax>`_
|
||||
- See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>`
|
||||
to get started.
|
||||
- You can also consult the upstream `Installation guide <https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`__
|
||||
for additional context.
|
||||
|
||||
- Official JAX release:
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
- Official JAX repository: `jax-ml/jax <https://github.com/jax-ml/jax>`_
|
||||
AMD releases official `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax/tags>`_
|
||||
quarterly alongside new ROCm releases. These images undergo full AMD testing.
|
||||
`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community/tags>`_
|
||||
follow upstream JAX releases and use the latest available ROCm version.
|
||||
|
||||
- See the `AMD GPU (Linux) installation section
|
||||
<https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`_ in
|
||||
the JAX documentation.
|
||||
JAX Plugin-PJRT with JAX/JAXLIB compatibility
|
||||
================================================================================
|
||||
|
||||
.. note::
|
||||
Portable JIT Runtime (PJRT) is an open, stable interface for device runtime and
|
||||
compiler. The following table details the ROCm version compatibility matrix
|
||||
between JAX Plugin–PJRT and JAX/JAXLIB.
|
||||
|
||||
AMD releases official `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
|
||||
quarterly alongside new ROCm releases. These images undergo full AMD testing.
|
||||
`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
|
||||
follow upstream JAX releases and use the latest available ROCm version.
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - JAX Plugin-PJRT
|
||||
- JAX/JAXLIB
|
||||
- ROCm
|
||||
* - 0.7.1
|
||||
- 0.7.1
|
||||
- 7.1.1, 7.1.0
|
||||
* - 0.6.0
|
||||
- 0.6.2, 0.6.0
|
||||
- 7.0.2, 7.0.1, 7.0.0
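
Once a matching plugin and JAX/JAXLIB pair from the table above is installed (for example, inside the ROCm JAX container), a quick sanity check is to list the devices JAX sees; the exact output depends on the system and is shown here only as a sketch:

.. code-block:: python

   import jax
   import jax.numpy as jnp

   print(jax.__version__)
   # Lists the available accelerators; AMD GPUs appear here when the ROCm PJRT plugin is active.
   print(jax.devices())

   # Any computation placed on the default backend then runs on the GPU.
   x = jnp.arange(8.0)
   print(jnp.dot(x, x))  # 140.0
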
Use cases and recommendations
|
||||
================================================================================
|
||||
@@ -71,7 +87,7 @@ Use cases and recommendations
|
||||
* The `Distributed fine-tuning with JAX on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
|
||||
outlines the process of fine-tuning a Bidirectional Encoder Representations
|
||||
from Transformers (BERT)-based large language model (LLM) using JAX for a text
|
||||
classification task. The blog post discuss techniques for parallelizing the
|
||||
classification task. The blog post discusses techniques for parallelizing the
|
||||
fine-tuning across multiple AMD GPUs and assesses the model's performance on a
|
||||
holdout dataset. During the fine-tuning, a BERT-base-cased transformer model
|
||||
and the General Language Understanding Evaluation (GLUE) benchmark dataset were
|
||||
@@ -90,9 +106,9 @@ For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.b
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
AMD provides preconfigured Docker images with JAX and the ROCm backend.
|
||||
These images are published on `Docker Hub <https://hub.docker.com/r/rocm/jax>`__ and are the
|
||||
recommended way to get started with deep learning with JAX on ROCm.
|
||||
AMD validates and publishes `JAX images <https://hub.docker.com/r/rocm/jax/tags>`__
|
||||
with ROCm backends on Docker Hub.
|
||||
|
||||
For ``jax-community`` images, see `rocm/jax-community
|
||||
<https://hub.docker.com/r/rocm/jax-community/tags>`__ on Docker Hub.
|
||||
|
||||
@@ -234,7 +250,7 @@ The ROCm supported data types in JAX are collected in the following table.
|
||||
|
||||
.. note::
|
||||
|
||||
JAX data type support is effected by the :ref:`key_rocm_libraries` and it's
|
||||
JAX data type support is affected by the :ref:`key_rocm_libraries` and is
collected on the :doc:`ROCm data types and precision support <rocm:reference/precision-support>`
|
||||
page.
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: llama.cpp deep learning framework compatibility
|
||||
:keywords: GPU, GGML, llama.cpp compatibility
|
||||
:description: llama.cpp compatibility
|
||||
:keywords: GPU, GGML, llama.cpp, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -20,73 +20,34 @@ to accelerate inference and reduce memory usage. Originally built as a CPU-first
|
||||
llama.cpp is easy to integrate with other programming environments and is widely
|
||||
adopted across diverse platforms, including consumer devices.
|
||||
|
||||
ROCm support for llama.cpp is upstreamed, and you can build the official source code
|
||||
with ROCm support:
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp
|
||||
<https://github.com/ROCm/llama.cpp>`_ repository.
|
||||
- The ROCm-supported version of llama.cpp is maintained in the official `https://github.com/ROCm/llama.cpp
|
||||
<https://github.com/ROCm/llama.cpp>`__ repository, which differs from the
|
||||
`https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`__ upstream repository.
|
||||
|
||||
- Due to independent compatibility considerations, this location differs from the
|
||||
`https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.
|
||||
|
||||
- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`,
|
||||
which includes ROCm, llama.cpp, and all required dependencies.
|
||||
- To get started and install llama.cpp on ROCm, use the prebuilt :ref:`Docker images <llama-cpp-docker-compat>`,
|
||||
which include ROCm, llama.cpp, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>`
|
||||
to install and get started.
|
||||
for installation and setup instructions.
|
||||
|
||||
- See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__
|
||||
in the upstream llama.cpp documentation.
|
||||
|
||||
.. note::
|
||||
|
||||
llama.cpp is supported on ROCm 7.0.0 and ROCm 6.4.x.
|
||||
|
||||
Supported devices
|
||||
================================================================================
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X, MI325X, MI210
|
||||
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:
|
||||
|
||||
- Plain C/C++ implementation with no external dependencies
|
||||
- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
|
||||
- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
|
||||
- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)
|
||||
|
||||
llama.cpp is also used in a range of real-world applications, including:
|
||||
|
||||
- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
|
||||
A simple maze game where AI-controlled agents attempt to trick the player.
|
||||
- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
|
||||
A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
|
||||
- Various other AI applications use llama.cpp as their inference engine;
|
||||
for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
|
||||
|
||||
For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__
|
||||
blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``,
|
||||
server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for
|
||||
AMD Instinct GPUs within the ROCm ecosystem.
|
||||
- You can also consult the upstream `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md>`__
|
||||
for additional context.
|
||||
|
||||
.. _llama-cpp-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp/tags>`__
|
||||
AMD validates and publishes `llama.cpp images <https://hub.docker.com/r/rocm/llama.cpp/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the available llama.cpp versions from the official Docker Hub.
|
||||
inventories represent the latest available llama.cpp versions from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. important::
|
||||
@@ -107,32 +68,35 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- llama.cpp
|
||||
- ROCm
|
||||
- Ubuntu
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_full/images/sha256-a2ecd635eaa65bb289a9041330128677f3ae88bee6fee0597424b17e38d4903c"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_full/images/sha256-a94f0c7a598cc6504ff9e8371c016d7a2f93e69bf54a36c870f9522567201f10g"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_server/images/sha256-cb46b47df415addb5ceb6e6fdf0be70bf9d7f6863bbe6e10c2441ecb84246d52"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_server/images/sha256-be175932c3c96e882dfbc7e20e0e834f58c89c2925f48b222837ee929dfc47ee"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_light/images/sha256-8f8536eec4b05c0ff1c022f9fc6c527ad1c89e6c1ca0906e4d39e4de73edbde9"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_light/images/sha256-d8ba0c70603da502c879b1f8010b439c8e7fa9f6cbdac8bbbbbba97cb41ebc9e"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6652 <https://github.com/ROCm/llama.cpp/tree/release/b6652>`__
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- 24.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_full/images/sha256-f36de2a3b03ae53e81c85422cb3780368c9891e1ac7884b04403a921fe2ea45d"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_full/images/sha256-37582168984f25dce636cc7288298e06d94472ea35f65346b3541e6422b678ee"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_server/images/sha256-df15e8ab11a6837cd3736644fec1e047465d49e37d610ab0b79df000371327df"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_server/images/sha256-7e70578e6c3530c6591cc2c26da24a9ee68a20d318e12241de93c83224f83720"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_light/images/sha256-4ea2d5bb7964f0ee3ea9b30ba7f343edd6ddfab1b1037669ca7eafad2e3c2bd7"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
<a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6652.amd0_rocm7.0.0_ubuntu22.04_light/images/sha256-9a5231acf88b4a229677bc2c636ea3fe78a7a80f558bd80910b919855de93ad5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
|
||||
- `b6652 <https://github.com/ROCm/llama.cpp/tree/release/b6652>`__
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- 22.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -146,6 +110,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
|
||||
- 24.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -159,7 +124,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
|
||||
- 22.04
|
||||
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -173,6 +138,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__
|
||||
- 24.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -186,7 +152,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__
|
||||
- 22.04
|
||||
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -200,6 +166,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
|
||||
- 24.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -213,6 +180,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
|
||||
- `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
|
||||
- 22.04
|
||||
- MI325X, MI300X, MI210
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -226,7 +194,9 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
|
||||
- `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
|
||||
- 24.04
|
||||
- MI300X, MI210
|
||||
|
||||
.. _llama-cpp-key-rocm-libraries:
|
||||
|
||||
Key ROCm libraries for llama.cpp
|
||||
================================================================================
|
||||
@@ -269,6 +239,36 @@ your corresponding ROCm version.
|
||||
- Can be used to enhance the flash attention performance on AMD compute, by enabling
|
||||
the flag during compile time.
|
||||
|
||||
.. _llama-cpp-uses-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:
|
||||
|
||||
- Plain C/C++ implementation with no external dependencies
|
||||
- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
|
||||
- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
|
||||
- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)
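
As a hedged sketch of the quantization and hybrid-inference options above, the following uses the community ``llama-cpp-python`` bindings; this is an assumption made purely for illustration, since the ROCm Docker images ship the native ``llama-cli`` and ``llama-server`` tools instead, and the GGUF model path is hypothetical:

.. code-block:: python

   from llama_cpp import Llama

   llm = Llama(
       model_path="models/llama-3-8b-instruct.Q4_K_M.gguf",  # hypothetical 4-bit quantized model
       n_gpu_layers=-1,  # offload all layers to the GPU; pick a smaller number for
                         # CPU+GPU hybrid inference when the model exceeds available VRAM
       n_ctx=4096,
   )

   out = llm("Q: What is integer quantization used for?\nA:", max_tokens=64, stop=["Q:"])
   print(out["choices"][0]["text"])
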
llama.cpp is also used in a range of real-world applications, including:
|
||||
|
||||
- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
|
||||
A simple maze game where AI-controlled agents attempt to trick the player.
|
||||
- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
|
||||
A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
|
||||
- Various other AI applications use llama.cpp as their inference engine;
|
||||
for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
|
||||
|
||||
For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
|
||||
|
||||
- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__
|
||||
blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``,
|
||||
server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for
|
||||
AMD Instinct GPUs within the ROCm ecosystem.
|
||||
|
||||
|
||||
Previous versions
|
||||
===============================================================================
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/llama-cpp-history` to find documentation for previous releases
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: Megablocks compatibility
|
||||
:keywords: GPU, megablocks, compatibility
|
||||
:keywords: GPU, megablocks, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,64 +10,41 @@
|
||||
Megablocks compatibility
|
||||
********************************************************************************
|
||||
|
||||
Megablocks is a light-weight library for mixture-of-experts (MoE) training.
|
||||
`Megablocks <https://github.com/databricks/megablocks>`__ is a lightweight library
|
||||
for mixture-of-experts `(MoE) <https://huggingface.co/blog/moe>`__ training.
|
||||
The core of the system is efficient "dropless-MoE" and standard MoE layers.
|
||||
Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_,
|
||||
Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM
|
||||
<https://github.com/stanford-futuredata/Megatron-LM>`__,
|
||||
where data and pipeline parallel training of MoEs is supported.
|
||||
|
||||
* ROCm support for Megablocks is hosted in the official `https://github.com/ROCm/megablocks <https://github.com/ROCm/megablocks>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker image <megablocks-docker-compat>` with ROCm, PyTorch, and Megablocks preinstalled.
|
||||
* See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>` to install and get started.
|
||||
|
||||
.. note::
|
||||
|
||||
Megablocks is supported on ROCm 6.3.0.
|
||||
|
||||
Supported devices
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- **Officially Supported**: AMD Instinct MI300X
|
||||
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
|
||||
- The ROCm-supported version of Megablocks is maintained in the official `https://github.com/ROCm/megablocks
|
||||
<https://github.com/ROCm/megablocks>`__ repository, which differs from the
|
||||
`https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`__ upstream repository.
|
||||
|
||||
Supported models and features
|
||||
================================================================================
|
||||
- To get started and install Megablocks on ROCm, use the prebuilt :ref:`Docker image <megablocks-docker-compat>`,
|
||||
which includes ROCm, Megablocks, and all required dependencies.
|
||||
|
||||
This section summarizes the Megablocks features supported by ROCm.
|
||||
|
||||
* Distributed Pre-training
|
||||
* Activation Checkpointing and Recomputation
|
||||
* Distributed Optimizer
|
||||
* Mixture-of-Experts
|
||||
* dropless-Mixture-of-Experts
|
||||
|
||||
|
||||
.. _megablocks-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
The `ROCm Megablocks blog posts <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_
|
||||
guide how to leverage the ROCm platform for pre-training using the Megablocks framework.
|
||||
It features how to pre-process datasets and how to begin pre-training on AMD GPUs through:
|
||||
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
- See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/databricks/megablocks>`__
|
||||
for additional context.
|
||||
|
||||
.. _megablocks-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `ROCm Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`_
|
||||
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest Megatron-LM version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
|
||||
AMD validates and publishes `Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated
|
||||
inventories represent the latest available Megablocks version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
@@ -80,6 +57,7 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
@@ -89,5 +67,38 @@ Click |docker-icon| to view the image on Docker Hub.
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
- MI300X
|
||||
|
||||
Supported models and features with ROCm 6.3.0
|
||||
================================================================================
|
||||
|
||||
This section summarizes the Megablocks features supported by ROCm.
|
||||
|
||||
* Distributed Pre-training
|
||||
* Activation Checkpointing and Recomputation
|
||||
* Distributed Optimizer
|
||||
* Mixture-of-Experts
|
||||
* dropless-Mixture-of-Experts
|
||||
|
||||
.. _megablocks-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`__
|
||||
blog post explains how to leverage the ROCm platform for pre-training using the
|
||||
Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts
|
||||
(MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it
|
||||
demonstrates how block-sparse computations can enhance scalability and efficiency in MoE
|
||||
training. The guide provides step-by-step instructions for setting up the environment,
|
||||
including cloning the repository, building the Docker image, and running the training container.
|
||||
Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training
|
||||
language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE
|
||||
training workflows for large-scale transformer models.
|
||||
|
||||
It shows how to pre-process datasets and begin pre-training on AMD GPUs through:
|
||||
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: PyTorch compatibility
|
||||
:keywords: GPU, PyTorch compatibility
|
||||
:keywords: GPU, PyTorch, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -15,40 +15,42 @@ deep learning. PyTorch on ROCm provides mixed-precision and large-scale training
|
||||
using `MIOpen <https://github.com/ROCm/MIOpen>`__ and
|
||||
`RCCL <https://github.com/ROCm/rccl>`__ libraries.
|
||||
|
||||
ROCm support for PyTorch is upstreamed into the official PyTorch repository. Due
|
||||
to independent compatibility considerations, this results in two distinct
|
||||
release cycles for PyTorch on ROCm:
|
||||
PyTorch provides two high-level features:
|
||||
|
||||
- ROCm PyTorch release:
|
||||
- Tensor computation (like NumPy) with strong GPU acceleration
|
||||
|
||||
- Provides the latest version of ROCm but might not necessarily support the
|
||||
latest stable PyTorch version.
|
||||
- Deep neural networks built on a tape-based autograd system (rapid computation
|
||||
of multiple partial derivatives or gradients)
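
A minimal sketch of both features together (shapes and the toy loss are arbitrary; on ROCm the ``cuda`` device targets AMD GPUs):

.. code-block:: python

   import torch

   # Tensor computation on the GPU.
   device = "cuda" if torch.cuda.is_available() else "cpu"
   x = torch.randn(64, 32, device=device)
   w = torch.randn(32, 8, device=device, requires_grad=True)

   # Tape-based autograd: the forward pass records operations, backward() replays them.
   loss = (x @ w).pow(2).mean()
   loss.backward()
   print(w.grad.shape)  # torch.Size([32, 8])
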
- Offers :ref:`Docker images <pytorch-docker-compat>` with ROCm and PyTorch
|
||||
preinstalled.
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- ROCm PyTorch repository: `<https://github.com/ROCm/pytorch>`__
|
||||
ROCm support for PyTorch is upstreamed into the official PyTorch repository.
|
||||
ROCm development is aligned with the stable release of PyTorch, while upstream
|
||||
PyTorch testing uses the stable release of ROCm to maintain consistency:
|
||||
|
||||
- See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
||||
to get started.
|
||||
- The ROCm-supported version of PyTorch is maintained in the official `https://github.com/ROCm/pytorch
|
||||
<https://github.com/ROCm/pytorch>`__ repository, which differs from the
|
||||
`https://github.com/pytorch/pytorch <https://github.com/pytorch/pytorch>`__ upstream repository.
|
||||
|
||||
- Official PyTorch release:
|
||||
- To get started and install PyTorch on ROCm, use the prebuilt :ref:`Docker images <pytorch-docker-compat>`,
|
||||
which include ROCm, PyTorch, and all required dependencies.
|
||||
|
||||
- Provides the latest stable version of PyTorch but might not necessarily
|
||||
support the latest ROCm version.
|
||||
- See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- Official PyTorch repository: `<https://github.com/pytorch/pytorch>`__
|
||||
|
||||
- See the `Nightly and latest stable version installation guide <https://pytorch.org/get-started/locally/>`__
|
||||
or `Previous versions <https://pytorch.org/get-started/previous-versions/>`__
|
||||
to get started.
|
||||
- You can also consult the upstream `Installation guide <https://pytorch.org/get-started/locally/>`__ or
|
||||
`Previous versions <https://pytorch.org/get-started/previous-versions/>`__ for additional context.
|
||||
|
||||
PyTorch includes tooling that generates HIP source code from the CUDA backend.
|
||||
This approach allows PyTorch to support ROCm without requiring manual code
|
||||
modifications. For more information, see :doc:`HIPIFY <hipify:index>`.
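
In practice, this means existing ``torch.cuda`` code paths run unchanged on ROCm. The following is a small sketch of how to confirm the backend at runtime (output values depend on the system):

.. code-block:: python

   import torch

   print(torch.cuda.is_available())      # True on a working ROCm installation
   print(torch.cuda.get_device_name(0))  # reports the AMD GPU name
   print(torch.version.hip)              # HIP version string on ROCm builds, None on CUDA builds

   t = torch.ones(4, device="cuda")      # allocates on the AMD GPU
   print(t * 2)
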
ROCm development is aligned with the stable release of PyTorch, while upstream
|
||||
PyTorch testing uses the stable release of ROCm to maintain consistency.
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
AMD releases official `ROCm PyTorch Docker images <https://hub.docker.com/r/rocm/pytorch/tags>`_
|
||||
quarterly alongside new ROCm releases. These images undergo full AMD testing.
|
||||
|
||||
.. _pytorch-recommendations:
|
||||
|
||||
@@ -78,7 +80,7 @@ Use cases and recommendations
|
||||
GPU.
|
||||
|
||||
* The :doc:`Inception with PyTorch documentation </conceptual/ai-pytorch-inception>`
|
||||
describes how PyTorch integrates with ROCm for AI workloads It outlines the
|
||||
describes how PyTorch integrates with ROCm for AI workloads. It outlines the
|
||||
use of PyTorch on the ROCm platform and focuses on efficiently leveraging AMD
|
||||
GPU hardware for training and inference tasks in AI applications.
|
||||
|
||||
@@ -89,9 +91,8 @@ For more use cases and recommendations, see `ROCm PyTorch blog posts <https://ro
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
AMD provides preconfigured Docker images with PyTorch and the ROCm backend.
|
||||
These images are published on `Docker Hub <https://hub.docker.com/r/rocm/pytorch>`__ and are the
|
||||
recommended way to get started with deep learning with PyTorch on ROCm.
|
||||
AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch/tags>`__
|
||||
with ROCm backends on Docker Hub.
|
||||
|
||||
To find the right image tag, see the :ref:`PyTorch on ROCm installation
|
||||
documentation <rocm-install-on-linux:pytorch-docker-support>` for a list of
|
||||
@@ -348,7 +349,7 @@ with ROCm.
|
||||
you need to explicitly move audio data (waveform tensor) to GPU using
|
||||
``.to('cuda')``.
|
||||
|
||||
* - `torchtune <https://docs.pytorch.org/torchtune/stable/index.html>`_
|
||||
* - `torchtune <https://meta-pytorch.org/torchtune/stable/index.html>`_
|
||||
- PyTorch-native library designed for fine-tuning large language models
|
||||
(LLMs). Supports the full fine-tuning workflow and offers
|
||||
compatibility with popular production inference systems.
|
||||
@@ -360,21 +361,12 @@ with ROCm.
|
||||
popular datasets, model architectures, and common image transformations
|
||||
for computer vision applications.
|
||||
|
||||
* - `torchtext <https://docs.pytorch.org/text/stable/index.html>`_
|
||||
- Text processing library for PyTorch. Provides data processing utilities
|
||||
and popular datasets for natural language processing, including
|
||||
tokenization, vocabulary management, and text embeddings.
|
||||
|
||||
**Note:** ``torchtext`` does not implement ROCm-specific kernels.
|
||||
ROCm acceleration is provided through the underlying PyTorch framework
|
||||
and ROCm library integration. Only official release exists.
|
||||
|
||||
* - `torchdata <https://meta-pytorch.org/data/beta/index.html#torchdata>`_
|
||||
- Beta library of common modular data loading primitives for easily
|
||||
constructing flexible and performant data pipelines, with features still
|
||||
in prototype stage.
|
||||
|
||||
* - `torchrec <https://docs.pytorch.org/torchrec/>`_
|
||||
* - `torchrec <https://meta-pytorch.org/torchrec/>`_
|
||||
- PyTorch domain library for common sparsity and parallelism primitives
|
||||
needed for large-scale recommender systems, enabling authors to train
|
||||
models with large embedding tables shared across many GPUs.
|
||||
@@ -407,7 +399,40 @@ with ROCm.
|
||||
|
||||
**Note:** Only the official release exists.
|
||||
|
||||
Key features and enhancements for PyTorch 2.7 with ROCm 7.0
|
||||
Key features and enhancements for PyTorch 2.9 with ROCm 7.1.1
|
||||
================================================================================
|
||||
- Scaled Dot Product Attention (SDPA) upgraded to use AOTriton version 0.11b.
|
||||
|
||||
- Default hipBLASLt support enabled for gfx908 architecture on ROCm 6.3 and later.
|
||||
|
||||
- MIOpen now supports channels last memory format for 3D convolutions and batch normalization.
|
||||
|
||||
- NHWC convolution operations in MIOpen optimized by eliminating unnecessary transpose operations.
|
||||
|
||||
- Improved tensor.item() performance by removing redundant synchronization.
|
||||
|
||||
- Enhanced performance for element-wise operations and reduction kernels.
|
||||
|
||||
- Added support for grouped GEMM operations through fbgemm_gpu generative AI components.
|
||||
|
||||
- Resolved device error in Inductor when using CUDA graph trees with HIP.
|
||||
|
||||
- Corrected logsumexp scaling in AOTriton-based SDPA implementation.
|
||||
|
||||
- Added stream graph capture status validation in memory copy synchronization functions.
|
||||
|
||||
Key features and enhancements for PyTorch 2.8 with ROCm 7.1
|
||||
================================================================================
|
||||
|
||||
- MIOpen deep learning optimizations: Further optimized NHWC BatchNorm feature.
|
||||
|
||||
- Added float8 support for the DeepSpeed extension, allowing for decreased
|
||||
memory footprint and increased throughput in training and inference workloads.
|
||||
|
||||
- ``torch.nn.functional.scaled_dot_product_attention`` now calls the optimized
  flash attention kernel automatically.
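
A minimal sketch of calling the fused kernel through the standard API (shapes and data type are arbitrary; on ROCm the flash-attention path is chosen automatically when the inputs are eligible):

.. code-block:: python

   import torch
   import torch.nn.functional as F

   # Batch of 2, 8 heads, sequence length 128, head dimension 64.
   q = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
   k = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
   v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)

   out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
   print(out.shape)  # torch.Size([2, 8, 128, 64])
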
Key features and enhancements for PyTorch 2.7/2.8 with ROCm 7.0
|
||||
================================================================================
|
||||
|
||||
- Enhanced TunableOp framework: Introduces ``tensorfloat32`` support for
|
||||
@@ -442,10 +467,6 @@ Key features and enhancements for PyTorch 2.7 with ROCm 7.0
|
||||
ROCm-specific test conditions, and enhanced unit test coverage for Flash
|
||||
Attention and Memory Efficient operations.
|
||||
|
||||
- Build system and infrastructure improvements: Provides updated CentOS Stream 9
|
||||
support, improved Docker configuration, migration to public MAGMA repository,
|
||||
and enhanced QA automation scripts for PyTorch unit testing.
|
||||
|
||||
- Composable Kernel (CK) updates: Features updated CK submodule integration with
|
||||
the latest optimizations and performance improvements for core mathematical
|
||||
operations.
|
||||
@@ -467,7 +488,7 @@ Key features and enhancements for PyTorch 2.7 with ROCm 7.0
|
||||
network training or inference. For AMD platforms, ``amdclang++`` has been
|
||||
validated as the supported compiler for building these extensions.
|
||||
|
||||
Known issues and notes for PyTorch 2.7 with ROCm 7.0
|
||||
Known issues and notes for PyTorch 2.7/2.8 with ROCm 7.0 and ROCm 7.1
|
||||
================================================================================
|
||||
|
||||
- The ``matmul.allow_fp16_reduced_precision_reduction`` and
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Ray deep learning framework compatibility
|
||||
:keywords: GPU, Ray compatibility
|
||||
:description: Ray compatibility
|
||||
:keywords: GPU, Ray, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -12,43 +12,74 @@ Ray compatibility
|
||||
|
||||
Ray is a unified framework for scaling AI and Python applications from your laptop
|
||||
to a full cluster, without changing your code. Ray consists of `a core distributed
|
||||
runtime <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`_ and a set of
|
||||
`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`_ for
|
||||
runtime <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`__ and a set of
|
||||
`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`__ for
|
||||
simplifying machine learning computations.
|
||||
|
||||
Ray is a general-purpose framework that runs many types of workloads efficiently.
|
||||
Any Python application can be scaled with Ray, without extra infrastructure.
|
||||
|
||||
ROCm support for Ray is upstreamed, and you can build the official source code
|
||||
with ROCm support:
|
||||
|
||||
- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray
|
||||
<https://github.com/ROCm/ray>`_ repository.
|
||||
|
||||
- Due to independent compatibility considerations, this location differs from the
|
||||
`https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.
|
||||
|
||||
- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>`
|
||||
which includes ROCm, Ray, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
|
||||
for instructions to get started.
|
||||
|
||||
- See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_
|
||||
in the upstream Ray documentation.
|
||||
|
||||
- The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__
|
||||
corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
|
||||
|
||||
.. note::
|
||||
|
||||
Ray is supported on ROCm 6.4.1.
|
||||
|
||||
Supported devices
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
**Officially Supported**: AMD Instinct™ MI300X, MI210
|
||||
- The ROCm-supported version of Ray is maintained in the official `https://github.com/ROCm/ray
|
||||
<https://github.com/ROCm/ray>`__ repository, which differs from the
|
||||
`https://github.com/ray-project/ray <https://github.com/ray-project/ray>`__ upstream repository.
|
||||
|
||||
- To get started and install Ray on ROCm, use the prebuilt :ref:`Docker image <ray-docker-compat>`,
|
||||
which includes ROCm, Ray, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://docs.ray.io/en/latest/ray-overview/installation.html>`__
|
||||
for additional context.
|
||||
|
||||
.. _ray-docker-compat:
|
||||
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories represent the latest Ray version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Ray
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/ray/ray-2.51.1_rocm7.0.0_ubuntu22.04_py3.12_pytorch2.9.0/images/sha256-a02f6766b4ba406f88fd7e85707ec86c04b569834d869a08043ec9bcbd672168"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `2.51.1 <https://github.com/ROCm/ray/tree/release/2.51.1>`__
|
||||
- 2.9.0a0+git1c57644
|
||||
- 22.04
|
||||
- `3.12.12 <https://www.python.org/downloads/release/python-31212/>`__
|
||||
- MI300X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
|
||||
- `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
|
||||
- `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`__
|
||||
- 2.6.0+git684f6f2
|
||||
- 24.04
|
||||
- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`__
|
||||
- MI300X, MI210
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
@@ -77,35 +108,7 @@ topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accel
|
||||
of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__,
|
||||
where you can search for Ray examples and best practices to optimize your workloads on AMD GPUs.
|
||||
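As a brief, hedged sketch of that scheduling model (the function and sizes below are
illustrative, not taken from the Ray documentation), a task can request a GPU and run
a ROCm PyTorch workload on it:

.. code-block:: python

   # Minimal sketch: schedule a task on one AMD GPU with Ray.
   # Assumes a ROCm-enabled PyTorch build, as shipped in the rocm/ray images.
   import ray
   import torch

   ray.init()  # connect to (or start) the local Ray cluster

   @ray.remote(num_gpus=1)          # ask the Ray scheduler for one GPU
   def gpu_matmul(n: int) -> float:
       device = "cuda"              # ROCm GPUs are addressed through the CUDA API in PyTorch
       a = torch.randn(n, n, device=device)
       b = torch.randn(n, n, device=device)
       return float((a @ b).sum())

   print(ray.get(gpu_matmul.remote(1024)))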
|
||||
.. _ray-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and
|
||||
associated inventories represent the latest Ray version from the official Docker Hub and are validated for
|
||||
`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
|
||||
icon to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- Ray
|
||||
- Pytorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
|
||||
- `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
|
||||
- 2.6.0+git684f6f2
|
||||
- 24.04
|
||||
- `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
|
||||
Previous versions
|
||||
===============================================================================
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/ray-history` to find documentation for previous releases
|
||||
of the ``ROCm/ray`` Docker image.
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: Stanford Megatron-LM compatibility
|
||||
:keywords: Stanford, Megatron-LM, compatibility
|
||||
:keywords: Stanford, Megatron-LM, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,34 +10,76 @@
|
||||
Stanford Megatron-LM compatibility
|
||||
********************************************************************************
|
||||
|
||||
Stanford Megatron-LM is a large-scale language model training framework developed by NVIDIA `https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_. It is
|
||||
designed to train massive transformer-based language models efficiently by model and data parallelism.
|
||||
Stanford Megatron-LM is a large-scale language model training framework developed
|
||||
by NVIDIA at `https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_.
|
||||
It is designed to train massive transformer-based language models efficiently by model
|
||||
and data parallelism.
|
||||
|
||||
* ROCm support for Stanford Megatron-LM is hosted in the official `https://github.com/ROCm/Stanford-Megatron-LM <https://github.com/ROCm/Stanford-Megatron-LM>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>` with ROCm, PyTorch, and Megatron-LM preinstalled.
|
||||
* See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>` to install and get started.
|
||||
It provides efficient tensor, pipeline, and sequence-based model parallelism for
|
||||
pre-training transformer-based language models such as GPT (Decoder Only), BERT
|
||||
(Encoder Only), and T5 (Encoder-Decoder).
|
||||
|
||||
.. note::
|
||||
|
||||
Stanford Megatron-LM is supported on ROCm 6.3.0.
|
||||
|
||||
|
||||
Supported Devices
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- **Officially Supported**: AMD Instinct MI300X
|
||||
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
|
||||
- The ROCm-supported version of Stanford Megatron-LM is maintained in the official `https://github.com/ROCm/Stanford-Megatron-LM
|
||||
<https://github.com/ROCm/Stanford-Megatron-LM>`__ repository, which differs from the
|
||||
`https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`__ upstream repository.
|
||||
|
||||
- To get started and install Stanford Megatron-LM on ROCm, use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>`,
|
||||
which includes ROCm, Stanford Megatron-LM, and all required dependencies.
|
||||
|
||||
Supported models and features
|
||||
- See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the upstream `Installation guide <https://github.com/NVIDIA/Megatron-LM>`__
|
||||
for additional context.
|
||||
|
||||
.. _megatron-lm-docker-compat:
|
||||
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/stanford-megatron-lm/tags>`_
|
||||
with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest Stanford Megatron-LM version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Stanford Megatron-LM
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i> rocm/stanford-megatron-lm</a>
|
||||
|
||||
- `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
|
||||
- `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
- MI300X
|
||||
|
||||
Supported models and features with ROCm 6.3.0
|
||||
================================================================================
|
||||
|
||||
This section details the models and features supported by the ROCm-enabled version of Stanford Megatron-LM.
|
||||
|
||||
Models:
|
||||
|
||||
* Bert
|
||||
* BERT
|
||||
* GPT
|
||||
* T5
|
||||
* ICT
|
||||
@@ -54,47 +96,21 @@ Features:
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
See the `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs blog <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_ post
|
||||
to leverage the ROCm platform for pre-training by using the Stanford Megatron-LM framework of pre-processing datasets on AMD GPUs.
|
||||
Coverage includes:
|
||||
The following blog post mentions Megablocks, but you can run Stanford Megatron-LM with the same steps to pre-process datasets on AMD GPUs:
|
||||
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs
|
||||
<https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`__
|
||||
blog post guides how to leverage the ROCm platform for pre-training using the
|
||||
Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts
|
||||
(MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it
|
||||
demonstrates how block-sparse computations can enhance scalability and efficiency in MoE
|
||||
training. The guide provides step-by-step instructions for setting up the environment,
|
||||
including cloning the repository, building the Docker image, and running the training container.
|
||||
Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training
|
||||
language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE
|
||||
training workflows for large-scale transformer models.
|
||||
|
||||
It covers how to pre-process datasets and how to begin pre-training on AMD GPUs through:
|
||||
|
||||
.. _megatron-lm-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/megatron-lm>`_
|
||||
with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
|
||||
inventories represent the latest Megatron-LM version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- Stanford Megatron-LM
|
||||
- PyTorch
|
||||
- Ubuntu
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i></a>
|
||||
|
||||
- `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
|
||||
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
|
||||
- 24.04
|
||||
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
|
||||
|
||||
|
||||
|
||||
* Single-GPU pre-training
|
||||
* Multi-GPU pre-training
|
||||
|
||||
@@ -1,76 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Taichi compatibility
|
||||
:keywords: GPU, Taichi compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
*******************************************************************************
|
||||
Taichi compatibility
|
||||
*******************************************************************************
|
||||
|
||||
`Taichi <https://www.taichi-lang.org/>`_ is an open-source, imperative, and parallel
|
||||
programming language designed for high-performance numerical computation.
|
||||
Embedded in Python, it leverages just-in-time (JIT) compilation frameworks such as LLVM to accelerate
|
||||
compute-intensive Python code by compiling it to native GPU or CPU instructions.
|
||||
|
||||
Taichi is widely used across various domains, including real-time physical simulation,
|
||||
numerical computing, augmented reality, artificial intelligence, computer vision, robotics,
|
||||
visual effects in film and gaming, and general-purpose computing.
|
||||
|
||||
* ROCm support for Taichi is hosted in the official `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`_ repository.
|
||||
* Due to independent compatibility considerations, this location differs from the `https://github.com/taichi-dev <https://github.com/taichi-dev>`_ upstream repository.
|
||||
* Use the prebuilt :ref:`Docker image <taichi-docker-compat>` with ROCm, PyTorch, and Taichi preinstalled.
|
||||
* See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>` to install and get started.
|
||||
|
||||
.. note::
|
||||
|
||||
Taichi is supported on ROCm 6.3.2.
|
||||
|
||||
Supported devices and features
|
||||
===============================================================================
|
||||
The ROCm software stack supports all Taichi GPU features on AMD Instinct MI250X and MI210 GPUs, with the exception of Taichi’s GPU rendering system, CGUI.
AMD Instinct MI300X Series GPUs will be supported by November.
|
||||
|
||||
.. _taichi-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
To fully leverage Taichi's performance capabilities in compute-intensive tasks, it is best to adhere to specific coding patterns and utilize Taichi decorators.
|
||||
A collection of example use cases is available in the `https://github.com/ROCm/taichi_examples <https://github.com/ROCm/taichi_examples>`_ repository,
|
||||
providing practical insights and foundational knowledge for working with the Taichi programming language.
|
||||
You can also refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_ to search for Taichi examples and best practices to optimize your workflows on AMD GPUs.
|
||||
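As a minimal sketch of those patterns (the field name and size below are illustrative,
not taken from the examples repository), a kernel decorated with ``@ti.kernel`` is
JIT-compiled for the GPU backend selected by ``ti.init``:

.. code-block:: python

   # Minimal sketch: a parallel Taichi kernel on the GPU backend.
   import taichi as ti

   ti.init(arch=ti.gpu)  # selects the available GPU backend (ROCm build on AMD GPUs)

   n = 1024
   x = ti.field(dtype=ti.f32, shape=n)

   @ti.kernel
   def scale(factor: ti.f32):
       for i in x:          # outermost loop is parallelized across GPU threads
           x[i] = i * factor

   scale(0.5)
   print(x[10])  # 5.0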
|
||||
.. _taichi-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm Taichi Docker images <https://hub.docker.com/r/rocm/taichi/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated inventories
|
||||
represent the latest Taichi version from the official Docker Hub.
|
||||
The Docker images have been validated for `ROCm 6.3.2 <https://rocm.docs.amd.com/en/docs-6.3.2/about/release-notes.html>`_.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- Taichi
|
||||
- Ubuntu
|
||||
- Python
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/taichi/taichi-1.8.0b1_rocm6.3.2_ubuntu22.04_py3.10.12/images/sha256-e016964a751e6a92199032d23e70fa3a564fff8555afe85cd718f8aa63f11fc6"><i class="fab fa-docker fa-lg"></i> rocm/taichi</a>
|
||||
- `6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`_
|
||||
- `1.8.0b1 <https://github.com/taichi-dev/taichi>`_
|
||||
- 22.04
|
||||
- `3.10.12 <https://www.python.org/downloads/release/python-31012/>`_
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: TensorFlow compatibility
|
||||
:keywords: GPU, TensorFlow compatibility
|
||||
:keywords: GPU, TensorFlow, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -12,37 +12,33 @@ TensorFlow compatibility
|
||||
|
||||
`TensorFlow <https://www.tensorflow.org/>`__ is an open-source library for
|
||||
solving machine learning, deep learning, and AI problems. It can solve many
|
||||
problems across different sectors and industries but primarily focuses on
|
||||
neural network training and inference. It is one of the most popular and
|
||||
in-demand frameworks and is very active in open-source contribution and
|
||||
development.
|
||||
problems across different sectors and industries, but primarily focuses on
|
||||
neural network training and inference. It is one of the most popular deep
|
||||
learning frameworks and is very active in open-source development.
|
||||
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
- The ROCm-supported version of TensorFlow is maintained in the official `https://github.com/ROCm/tensorflow-upstream
|
||||
<https://github.com/ROCm/tensorflow-upstream>`__ repository, which differs from the
|
||||
`https://github.com/tensorflow/tensorflow <https://github.com/tensorflow/tensorflow>`__ upstream repository.
|
||||
|
||||
- To get started and install TensorFlow on ROCm, use the prebuilt :ref:`Docker images <tensorflow-docker-compat>`,
|
||||
which include ROCm, TensorFlow, and all required dependencies.
|
||||
|
||||
- See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
- You can also consult the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list
|
||||
for additional context.
|
||||
|
||||
Version support
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
The `official TensorFlow repository <http://github.com/tensorflow/tensorflow>`__
|
||||
includes full ROCm support. AMD maintains a TensorFlow `ROCm repository
|
||||
<http://github.com/rocm/tensorflow-upstream>`__ in order to quickly add bug
|
||||
fixes, updates, and support for the latest ROCM versions.
|
||||
|
||||
- ROCm TensorFlow release:
|
||||
|
||||
- Offers :ref:`Docker images <tensorflow-docker-compat>` with
|
||||
ROCm and TensorFlow pre-installed.
|
||||
|
||||
- ROCm TensorFlow repository: `<https://github.com/ROCm/tensorflow-upstream>`__
|
||||
|
||||
- See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
|
||||
to get started.
|
||||
|
||||
- Official TensorFlow release:
|
||||
|
||||
- Official TensorFlow repository: `<https://github.com/tensorflow/tensorflow>`__
|
||||
|
||||
- See the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list.
|
||||
|
||||
.. note::
|
||||
|
||||
The official TensorFlow documentation does not cover ROCm support. Use the
|
||||
ROCm documentation for installation instructions for Tensorflow on ROCm.
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/tensorflow-install`.
|
||||
fixes, updates, and support for the latest ROCm versions.
|
||||
|
||||
.. _tensorflow-docker-compat:
|
||||
|
||||
@@ -140,7 +136,7 @@ The following section maps supported data types and GPU-accelerated TensorFlow
|
||||
features to their minimum supported ROCm and TensorFlow versions.
|
||||
|
||||
Data types
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
---------------
|
||||
|
||||
The data type of a tensor is specified using the ``dtype`` attribute or
|
||||
argument, and TensorFlow supports a wide range of data types for different use
|
||||
@@ -258,7 +254,7 @@ are as follows:
|
||||
- 1.7
|
||||
|
||||
Features
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
---------------
|
||||
|
||||
This table provides an overview of key features in TensorFlow and their
|
||||
availability in ROCm.
|
||||
@@ -350,7 +346,7 @@ availability in ROCm.
|
||||
- 1.9.2
|
||||
|
||||
Distributed library features
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
-----------------------------------
|
||||
|
||||
Enables developers to scale computations across multiple devices on a single machine or
|
||||
across multiple machines.
|
||||
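As a brief, hedged illustration (the model and shapes are placeholders, not drawn from
the feature table), ``tf.distribute.MirroredStrategy`` replicates a Keras model across
all visible GPUs on a single machine:

.. code-block:: python

   # Minimal sketch: single-node data parallelism with MirroredStrategy.
   import tensorflow as tf

   strategy = tf.distribute.MirroredStrategy()   # one replica per visible GPU
   print("replicas:", strategy.num_replicas_in_sync)

   with strategy.scope():                        # variables are mirrored across GPUs
       model = tf.keras.Sequential([
           tf.keras.layers.Dense(128, activation="relu", input_shape=(784,)),
           tf.keras.layers.Dense(10),
       ])
       model.compile(
           optimizer="adam",
           loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
       )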
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
.. meta::
|
||||
:description: verl compatibility
|
||||
:keywords: GPU, verl compatibility
|
||||
:keywords: GPU, verl, deep learning, framework compatibility
|
||||
|
||||
.. version-set:: rocm_version latest
|
||||
|
||||
@@ -10,77 +10,109 @@
|
||||
verl compatibility
|
||||
*******************************************************************************
|
||||
|
||||
Volcano Engine Reinforcement Learning for LLMs (verl) is a reinforcement learning framework designed for large language models (LLMs).
|
||||
verl offers a scalable, open-source fine-tuning solution optimized for AMD Instinct GPUs with full ROCm support.
|
||||
Volcano Engine Reinforcement Learning for LLMs (`verl <https://verl.readthedocs.io/en/latest/>`__)
|
||||
is a reinforcement learning framework designed for large language models (LLMs).
|
||||
verl offers a scalable, open-source fine-tuning solution by using a hybrid programming model
|
||||
that makes it easy to define and run complex post-training dataflows efficiently.
|
||||
|
||||
* See the `verl documentation <https://verl.readthedocs.io/en/latest/>`_ for more information about verl.
|
||||
* The official verl GitHub repository is `https://github.com/volcengine/verl <https://github.com/volcengine/verl>`_.
|
||||
* Use the AMD-validated :ref:`Docker images <verl-docker-compat>` with ROCm and verl preinstalled.
|
||||
* See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>` to install and get started.
|
||||
Its modular APIs separate computation from data, allowing smooth integration with other frameworks.
|
||||
It also supports flexible model placement across GPUs for efficient scaling on different cluster sizes.
|
||||
verl achieves high training and generation throughput by building on existing LLM frameworks.
|
||||
Its 3D-HybridEngine reduces memory use and communication overhead when switching between training
|
||||
and inference, improving overall performance.
|
||||
|
||||
.. note::
|
||||
|
||||
verl is supported on ROCm 6.2.0.
|
||||
|
||||
.. _verl-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
Support overview
|
||||
================================================================================
|
||||
|
||||
The benefits of verl in large-scale reinforcement learning from human feedback (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`_ blog.
|
||||
- The ROCm-supported version of verl is maintained in the official `https://github.com/ROCm/verl
|
||||
<https://github.com/ROCm/verl>`__ repository, which differs from the
|
||||
`https://github.com/volcengine/verl <https://github.com/volcengine/verl>`__ upstream repository.
|
||||
|
||||
.. _verl-supported_features:
|
||||
- To get started and install verl on ROCm, use the prebuilt :ref:`Docker image <verl-docker-compat>`,
|
||||
which includes ROCm, verl, and all required dependencies.
|
||||
|
||||
Supported features
|
||||
===============================================================================
|
||||
- See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>`
|
||||
for installation and setup instructions.
|
||||
|
||||
The following table shows verl on ROCm support for GPU-accelerated modules.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Module
|
||||
- Description
|
||||
- verl version
|
||||
- ROCm version
|
||||
* - ``FSDP``
|
||||
- Training engine
|
||||
- 0.3.0.post0
|
||||
- 6.2.0
|
||||
* - ``vllm``
|
||||
- Inference engine
|
||||
- 0.3.0.post0
|
||||
- 6.2.0
|
||||
- You can also consult the upstream `verl documentation <https://verl.readthedocs.io/en/latest/>`__
|
||||
for additional context.
|
||||
|
||||
.. _verl-docker-compat:
|
||||
|
||||
Docker image compatibility
|
||||
Compatibility matrix
|
||||
================================================================================
|
||||
|
||||
.. |docker-icon| raw:: html
|
||||
|
||||
<i class="fab fa-docker"></i>
|
||||
|
||||
AMD validates and publishes ready-made `ROCm verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tags and associated inventories represent the available verl versions from the official Docker Hub.
|
||||
AMD validates and publishes `verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
|
||||
with ROCm backends on Docker Hub. The following Docker image tag and associated inventories
|
||||
represent the latest verl version from the official Docker Hub.
|
||||
Click |docker-icon| to view the image on Docker Hub.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:class: docker-image-compatibility
|
||||
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- verl
|
||||
- Ubuntu
|
||||
- Pytorch
|
||||
- Python
|
||||
- vllm
|
||||
* - Docker image
|
||||
- ROCm
|
||||
- verl
|
||||
- Ubuntu
|
||||
- PyTorch
|
||||
- Python
|
||||
- vllm
|
||||
- GPU
|
||||
|
||||
* - .. raw:: html
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/verl/verl-0.3.0.post0_rocm6.2_vllm0.6.3/images/sha256-cbe423803fd7850448b22444176bee06f4dcf22cd3c94c27732752d3a39b04b2"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
|
||||
- `6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`_
|
||||
- `0.3.0post0 <https://github.com/volcengine/verl/releases/tag/v0.3.0.post0>`_
|
||||
- 20.04
|
||||
- `2.5.0 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
|
||||
- `3.9.19 <https://www.python.org/downloads/release/python-3919/>`_
|
||||
- `0.6.3 <https://github.com/vllm-project/vllm/releases/tag/v0.6.3>`_
|
||||
<a href="https://hub.docker.com/layers/rocm/verl/verl-0.6.0.amd0_rocm7.0_vllm0.11.0.dev/images/sha256-f70a3ebc94c1f66de42a2fcc3f8a6a8d6d0881eb0e65b6958d7d6d24b3eecb0d"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
|
||||
- `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
|
||||
- `0.6.0 <https://github.com/volcengine/verl/releases/tag/v0.6.0>`__
|
||||
- 22.04
|
||||
- `2.9.0 <https://github.com/ROCm/pytorch/tree/release/2.9-rocm7.x-gfx115x>`__
|
||||
- `3.12.11 <https://www.python.org/downloads/release/python-31211/>`__
|
||||
- `0.11.0 <https://github.com/vllm-project/vllm/releases/tag/v0.11.0>`__
|
||||
- MI300X
|
||||
|
||||
* - .. raw:: html
|
||||
|
||||
<a href="https://hub.docker.com/layers/rocm/verl/verl-0.3.0.post0_rocm6.2_vllm0.6.3/images/sha256-cbe423803fd7850448b22444176bee06f4dcf22cd3c94c27732752d3a39b04b2"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
|
||||
- `6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`__
|
||||
- `0.3.0.post0 <https://github.com/volcengine/verl/releases/tag/v0.3.0.post0>`__
|
||||
- 20.04
|
||||
- `2.5.0 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
|
||||
- `3.9.19 <https://www.python.org/downloads/release/python-3919/>`__
|
||||
- `0.6.3 <https://github.com/vllm-project/vllm/releases/tag/v0.6.3>`__
|
||||
- MI300X
|
||||
|
||||
.. _verl-supported_features:
|
||||
|
||||
Supported modules with verl on ROCm
|
||||
===============================================================================
|
||||
|
||||
The following GPU-accelerated modules are supported with verl on ROCm:
|
||||
|
||||
- ``FSDP``: Training engine
|
||||
- ``vllm``: Inference engine
|
||||
|
||||
.. _verl-recommendations:
|
||||
|
||||
Use cases and recommendations
|
||||
================================================================================
|
||||
|
||||
* The benefits of verl in large-scale reinforcement learning from human feedback
|
||||
(RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD
|
||||
GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__
|
||||
blog. The blog post outlines how the Volcano Engine Reinforcement Learning
|
||||
(verl) framework integrates with the AMD ROCm platform to optimize training on
|
||||
AMD Instinct™ GPUs. The guide details the process of building a Docker image,
|
||||
setting up single-node and multi-node training environments, and highlights
|
||||
performance benchmarks demonstrating improved throughput and convergence accuracy.
|
||||
This resource serves as a comprehensive starting point for deploying verl on AMD GPUs,
|
||||
facilitating efficient RLHF training workflows.
|
||||
|
||||
Previous versions
|
||||
===============================================================================
|
||||
See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/verl-history` to find documentation for previous releases
|
||||
of the ``ROCm/verl`` Docker image.
|
||||
|
||||
@@ -34,7 +34,7 @@ Runtime
|
||||
|
||||
```{code-block} shell
|
||||
:caption: Example exposing the first device and a device selected by UUID.
|
||||
export ROCR_VISIBLE_DEVICES="0,GPU-DEADBEEFDEADBEEF"
|
||||
export ROCR_VISIBLE_DEVICES="0,GPU-4b2c1a9f-8d3e-6f7a-b5c9-2e4d8a1f6c3b"
|
||||
```
|
||||
|
||||
### `GPU_DEVICE_ORDINAL`
|
||||
|
||||
docs/conf.py
@@ -8,6 +8,7 @@ import os
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from subprocess import run
|
||||
|
||||
gh_release_path = os.path.join("..", "RELEASE.md")
|
||||
gh_changelog_path = os.path.join("..", "CHANGELOG.md")
|
||||
@@ -80,24 +81,27 @@ latex_elements = {
|
||||
}
|
||||
|
||||
html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "rocm.docs.amd.com")
|
||||
html_context = {}
|
||||
html_context = {"docs_header_version": "7.1.1"}
|
||||
if os.environ.get("READTHEDOCS", "") == "True":
|
||||
html_context["READTHEDOCS"] = True
|
||||
|
||||
# Check if the branch is a docs/ branch
|
||||
official_branch = run(["git", "rev-parse", "--abbrev-ref", "HEAD"], capture_output=True, text=True).stdout.find("docs/")
|
||||
|
||||
# configurations for PDF output by Read the Docs
|
||||
project = "ROCm Documentation"
|
||||
project_path = os.path.abspath(".").replace("\\", "/")
|
||||
author = "Advanced Micro Devices, Inc."
|
||||
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
|
||||
version = "7.0.2"
|
||||
release = "7.0.2"
|
||||
version = "7.1.1"
|
||||
release = "7.1.1"
|
||||
setting_all_article_info = True
|
||||
all_article_info_os = ["linux", "windows"]
|
||||
all_article_info_author = ""
|
||||
|
||||
# pages with specific settings
|
||||
article_pages = [
|
||||
{"file": "about/release-notes", "os": ["linux"], "date": "2025-10-10"},
|
||||
{"file": "about/release-notes", "os": ["linux"], "date": "2025-11-26"},
|
||||
{"file": "release/changelog", "os": ["linux"],},
|
||||
{"file": "compatibility/compatibility-matrix", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
|
||||
@@ -107,7 +111,6 @@ article_pages = [
|
||||
{"file": "compatibility/ml-compatibility/stanford-megatron-lm-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
|
||||
{"file": "compatibility/ml-compatibility/flashinfer-compatibility", "os": ["linux"]},
|
||||
@@ -132,9 +135,15 @@ article_pages = [
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.7", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.8", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.9", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.8", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.9", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
|
||||
@@ -142,13 +151,19 @@ article_pages = [
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.7", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.8", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.9", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.8", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.9", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-pytorch-v25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
|
||||
@@ -173,8 +188,16 @@ article_pages = [
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.1-20250909", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.2-20251006", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.11.1-20251103", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/sglang-history", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/xdit-diffusion-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.10", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.11", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.12", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-25.13", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
|
||||
@@ -202,7 +225,7 @@ external_toc_path = "./sphinx/_toc.yml"
|
||||
# Add the _extensions directory to Python's search path
|
||||
sys.path.append(str(Path(__file__).parent / 'extension'))
|
||||
|
||||
extensions = ["rocm_docs", "sphinx_reredirects", "sphinx_sitemap", "sphinxcontrib.datatemplates", "version-ref", "csv-to-list-table"]
|
||||
extensions = ["rocm_docs", "sphinx_reredirects", "sphinx_sitemap", "sphinxcontrib.datatemplates", "remote-content", "version-ref", "csv-to-list-table"]
|
||||
|
||||
compatibility_matrix_file = str(Path(__file__).parent / 'compatibility/compatibility-matrix-historical-6.0.csv')
|
||||
|
||||
@@ -212,10 +235,14 @@ external_projects_current_project = "rocm"
|
||||
# external_projects_remote_repository = ""
|
||||
|
||||
html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "https://rocm-stg.amd.com/")
|
||||
html_context = {}
|
||||
html_context = {"docs_header_version": "7.1.0"}
|
||||
if os.environ.get("READTHEDOCS", "") == "True":
|
||||
html_context["READTHEDOCS"] = True
|
||||
|
||||
html_context["official_branch"] = official_branch
|
||||
html_context["version"] = version
|
||||
html_context["release"] = release
|
||||
|
||||
html_theme = "rocm_docs_theme"
|
||||
html_theme_options = {"flavor": "rocm-docs-home"}
|
||||
|
||||
@@ -241,3 +268,6 @@ html_context = {
|
||||
"granularity_type" : [('Coarse-grained', 'coarse-grained'), ('Fine-grained', 'fine-grained')],
|
||||
"scope_type" : [('Device', 'device'), ('System', 'system')]
|
||||
}
|
||||
|
||||
# Disable figure and table numbering
|
||||
numfig = False
|
||||
|
||||
@@ -0,0 +1,316 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
vLLM: 0.10.2 (0.11.0rc2.dev160+g790d22168.rocm700)
|
||||
PyTorch: 2.9.0a0+git1c57644
|
||||
hipBLASLt: 1.0.0
|
||||
dockerfile:
|
||||
commit: 790d22168820507f3105fef29596549378cfe399
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 4096
|
||||
max_model_len: 4096
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp4
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_vllm_llama-3.3-70b
|
||||
model_repo: meta-llama/Llama-3.3-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp8
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp4
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Scout 17Bx16E
|
||||
mad_tag: pyt_vllm_llama-4-scout-17b-16e
|
||||
model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E FP8
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek R1 0528 FP8
|
||||
mad_tag: pyt_vllm_deepseek-r1
|
||||
model_repo: deepseek-ai/DeepSeek-R1-0528
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_seqs: 1024
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: OpenAI GPT OSS
|
||||
tag: gpt-oss
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_vllm_gpt-oss-20b
|
||||
model_repo: openai/gpt-oss-20b
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_vllm_gpt-oss-120b
|
||||
model_repo: openai/gpt-oss-120b
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen3 8B
|
||||
mad_tag: pyt_vllm_qwen3-8b
|
||||
model_repo: Qwen/Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 32B
|
||||
mad_tag: pyt_vllm_qwen3-32b
|
||||
model_repo: Qwen/Qwen3-32b
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B FP8
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b_fp8
|
||||
model_repo: Qwen/Qwen3-30B-A3B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b
|
||||
model_repo: Qwen/Qwen3-235B-A22B
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B FP8
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b_fp8
|
||||
model_repo: Qwen/Qwen3-235B-A22B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 16384
|
||||
max_model_len: 8192
|
||||
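Each inventory entry above maps its ``config`` keys onto vLLM engine arguments. As a
hedged sketch (the launch wrapper actually used by the benchmark scripts is not shown
here), the Llama 3.1 8B entry corresponds roughly to constructing the engine as follows:

.. code-block:: python

   # Hedged sketch: mapping an inventory entry's config onto vLLM engine arguments.
   from vllm import LLM

   llm = LLM(
       model="meta-llama/Llama-3.1-8B-Instruct",  # model_repo from the entry above
       tensor_parallel_size=1,                    # tp
       dtype="auto",                              # dtype
       kv_cache_dtype="auto",                     # kv_cache_dtype
       max_num_batched_tokens=131072,             # max_num_batched_tokens
       max_model_len=8192,                        # max_model_len
   )
   print(llm.generate("Hello from ROCm")[0].outputs[0].text)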
@@ -0,0 +1,316 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
vLLM: 0.11.1 (0.11.1rc2.dev141+g38f225c2a.rocm700)
|
||||
PyTorch: 2.9.0a0+git1c57644
|
||||
hipBLASLt: 1.0.0
|
||||
dockerfile:
|
||||
commit: 38f225c2abeadc04c2cc398814c2f53ea02c3c72
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_vllm_llama-2-70b
|
||||
model_repo: meta-llama/Llama-2-70b-chat-hf
|
||||
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 4096
|
||||
max_model_len: 4096
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_vllm_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 8B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-8b_fp8
|
||||
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_vllm_llama-3.1-405b
|
||||
model_repo: meta-llama/Llama-3.1-405B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B FP8
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp8
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.1 405B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.1-405b_fp4
|
||||
model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_vllm_llama-3.3-70b
|
||||
model_repo: meta-llama/Llama-3.3-70B-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B FP8
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp8
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 3.3 70B MXFP4
|
||||
mad_tag: pyt_vllm_llama-3.3-70b_fp4
|
||||
model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview
|
||||
precision: float4
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Scout 17Bx16E
|
||||
mad_tag: pyt_vllm_llama-4-scout-17b-16e
|
||||
model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Llama 4 Maverick 17Bx128E FP8
|
||||
mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8
|
||||
model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek R1 0528 FP8
|
||||
mad_tag: pyt_vllm_deepseek-r1
|
||||
model_repo: deepseek-ai/DeepSeek-R1-0528
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_seqs: 1024
|
||||
max_num_batched_tokens: 131072
|
||||
max_model_len: 8192
|
||||
- group: OpenAI GPT OSS
|
||||
tag: gpt-oss
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_vllm_gpt-oss-20b
|
||||
model_repo: openai/gpt-oss-20b
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_vllm_gpt-oss-120b
|
||||
model_repo: openai/gpt-oss-120b
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: bfloat16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 8192
|
||||
max_model_len: 8192
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral MoE 8x7B
|
||||
mad_tag: pyt_vllm_mixtral-8x7b
|
||||
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x7B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x7b_fp8
|
||||
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 32768
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B
|
||||
mad_tag: pyt_vllm_mixtral-8x22b
|
||||
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- model: Mixtral MoE 8x22B FP8
|
||||
mad_tag: pyt_vllm_mixtral-8x22b_fp8
|
||||
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 65536
|
||||
max_model_len: 8192
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen3 8B
|
||||
mad_tag: pyt_vllm_qwen3-8b
|
||||
model_repo: Qwen/Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 32B
|
||||
mad_tag: pyt_vllm_qwen3-32b
|
||||
model_repo: Qwen/Qwen3-32B
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b
|
||||
model_repo: Qwen/Qwen3-30B-A3B
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 30B A3B FP8
|
||||
mad_tag: pyt_vllm_qwen3-30b-a3b_fp8
|
||||
model_repo: Qwen/Qwen3-30B-A3B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b
|
||||
model_repo: Qwen/Qwen3-235B-A22B
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B
|
||||
precision: float16
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- model: Qwen3 235B A22B FP8
|
||||
mad_tag: pyt_vllm_qwen3-235b-a22b_fp8
|
||||
model_repo: Qwen/Qwen3-235B-A22B-FP8
|
||||
url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8
|
||||
precision: float8
|
||||
config:
|
||||
tp: 8
|
||||
dtype: auto
|
||||
kv_cache_dtype: fp8
|
||||
max_num_batched_tokens: 40960
|
||||
max_model_len: 8192
|
||||
- group: Microsoft Phi
|
||||
tag: phi
|
||||
models:
|
||||
- model: Phi-4
|
||||
mad_tag: pyt_vllm_phi-4
|
||||
model_repo: microsoft/phi-4
|
||||
url: https://huggingface.co/microsoft/phi-4
|
||||
precision: float16
|
||||
config:
|
||||
tp: 1
|
||||
dtype: auto
|
||||
kv_cache_dtype: auto
|
||||
max_num_batched_tokens: 16384
|
||||
max_model_len: 8192
|
||||
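Each entry's config block above records the serving parameters used with these models (tensor parallelism, dtypes, batching limits). As an informal sketch of how such an entry maps onto vLLM engine arguments, assuming the offline vllm.LLM entry point and mirroring the Llama 3.1 405B FP8 row above (the prompt and the hand-copied dictionary are illustrative only):

# Sketch only: map one model entry's config onto vLLM engine arguments.
# Assumes the offline vllm.LLM API; values mirror the Llama 3.1 405B FP8 entry above.
from vllm import LLM, SamplingParams

entry = {
    "model_repo": "amd/Llama-3.1-405B-Instruct-FP8-KV",
    "config": {
        "tp": 8,
        "dtype": "auto",
        "kv_cache_dtype": "fp8",
        "max_num_batched_tokens": 131072,
        "max_model_len": 8192,
    },
}

llm = LLM(
    model=entry["model_repo"],
    tensor_parallel_size=entry["config"]["tp"],
    dtype=entry["config"]["dtype"],
    kv_cache_dtype=entry["config"]["kv_cache_dtype"],
    max_num_batched_tokens=entry["config"]["max_num_batched_tokens"],
    max_model_len=entry["config"]["max_model_len"],
)
print(llm.generate(["Hello from ROCm"], SamplingParams(max_tokens=16)))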
@@ -0,0 +1,55 @@
|
||||
xdit_diffusion_inference:
|
||||
docker:
|
||||
pull_tag: rocm/pytorch-xdit:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.10/images/sha256-d79715ff18a9470e3f907cec8a9654d6b783c63370b091446acffc0de4d7070e
|
||||
ROCm: 7.9.0
|
||||
components:
|
||||
TheRock: 7afbe45
|
||||
rccl: 9b04b2a
|
||||
composable_kernel: b7a806f
|
||||
rocm-libraries: f104555
|
||||
rocm-systems: 25922d0
|
||||
torch: 2.10.0a0+gite9c9017
|
||||
torchvision: 0.22.0a0+966da7e
|
||||
triton: 3.5.0+git52e49c12
|
||||
accelerate: 1.11.0.dev0
|
||||
aiter: 0.1.5.post4.dev20+ga25e55e79
|
||||
diffusers: 0.36.0.dev0
|
||||
xfuser: 0.4.4
|
||||
yunchang: 0.6.3.post1
|
||||
|
||||
model_groups:
|
||||
- group: Hunyuan Video
|
||||
tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
model_name: hunyuanvideo
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
- group: Wan-AI
|
||||
tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
model_name: wan2_1-i2v-14b-720p
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
- model: Wan2.2
|
||||
model_name: wan2_2-i2v-a14b
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
- group: FLUX
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
model_name: FLUX.1-dev
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
@@ -0,0 +1,109 @@
|
||||
xdit_diffusion_inference:
|
||||
docker:
|
||||
- version: v25-11
|
||||
pull_tag: rocm/pytorch-xdit:v25.11
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.11/images/sha256-c9fa659439bb024f854b4d5eea598347251b02c341c55f66c98110832bde4216
|
||||
ROCm: 7.10.0
|
||||
supported_models:
|
||||
- group: Hunyuan Video
|
||||
models:
|
||||
- Hunyuan Video
|
||||
- group: Wan-AI
|
||||
models:
|
||||
- Wan2.1
|
||||
- Wan2.2
|
||||
- group: FLUX
|
||||
models:
|
||||
- FLUX.1
|
||||
whats_new:
|
||||
- "Minor bug fixes and clarifications to READMEs."
|
||||
- "Bumps TheRock, AITER, Diffusers, xDiT versions."
|
||||
- "Changes Aiter rounding mode for faster gfx942 FWD Attention."
|
||||
components:
|
||||
TheRock: 3e3f834
|
||||
rccl: d23d18f
|
||||
composable_kernel: 2570462
|
||||
rocm-libraries: 0588f07
|
||||
rocm-systems: 473025a
|
||||
torch: 73adac
|
||||
torchvision: f5c6c2e
|
||||
triton: 7416ffc
|
||||
accelerate: 34c1779
|
||||
aiter: de14bec
|
||||
diffusers: 40528e9
|
||||
xfuser: 83978b5
|
||||
yunchang: 2c9b712
|
||||
|
||||
- version: v25-10
|
||||
pull_tag: rocm/pytorch-xdit:v25.10
|
||||
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-xdit
|
||||
ROCm: 7.9.0
|
||||
supported_models:
|
||||
- group: Hunyuan Video
|
||||
models:
|
||||
- Hunyuan Video
|
||||
- group: Wan-AI
|
||||
models:
|
||||
- Wan2.1
|
||||
- Wan2.2
|
||||
- group: FLUX
|
||||
models:
|
||||
- FLUX.1
|
||||
whats_new:
|
||||
- "First official xDiT Docker Release for Diffusion Inference."
|
||||
- "Supports gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X)."
|
||||
- "Support Wan 2.1, Wan 2.2, HunyuanVideo and Flux workloads."
|
||||
components:
|
||||
TheRock: 7afbe45
|
||||
rccl: 9b04b2a
|
||||
composable_kernel: b7a806f
|
||||
rocm-libraries: f104555
|
||||
rocm-systems: 25922d0
|
||||
torch: 2.10.0a0+gite9c9017
|
||||
torchvision: 0.22.0a0+966da7e
|
||||
triton: 3.5.0+git52e49c12
|
||||
accelerate: 1.11.0.dev0
|
||||
aiter: 0.1.5.post4.dev20+ga25e55e79
|
||||
diffusers: 0.36.0.dev0
|
||||
xfuser: 0.4.4
|
||||
yunchang: 0.6.3.post1
|
||||
|
||||
model_groups:
|
||||
- group: Hunyuan Video
|
||||
tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
page_tag: hunyuan_tag
|
||||
model_name: hunyuanvideo
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
- group: Wan-AI
|
||||
tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
page_tag: wan_21_tag
|
||||
model_name: wan2_1-i2v-14b-720p
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
- model: Wan2.2
|
||||
page_tag: wan_22_tag
|
||||
model_name: wan2_2-i2v-a14b
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
- group: FLUX
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
page_tag: flux_1_tag
|
||||
model_name: FLUX.1-dev
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
@@ -0,0 +1,91 @@
|
||||
docker:
|
||||
pull_tag: rocm/pytorch-xdit:v25.12
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.12/images/sha256-e06895132316bf3c393366b70a91eaab6755902dad0100e6e2b38310547d9256
|
||||
ROCm: 7.10.0
|
||||
whats_new:
|
||||
- "Adds T2V and TI2V support for Wan models."
|
||||
- "Adds support for SD-3.5 T2I model."
|
||||
components:
|
||||
TheRock:
|
||||
version: 3e3f834
|
||||
url: https://github.com/ROCm/TheRock
|
||||
rccl:
|
||||
version: d23d18f
|
||||
url: https://github.com/ROCm/rccl
|
||||
composable_kernel:
|
||||
version: 2570462
|
||||
url: https://github.com/ROCm/composable_kernel
|
||||
rocm-libraries:
|
||||
version: 0588f07
|
||||
url: https://github.com/ROCm/rocm-libraries
|
||||
rocm-systems:
|
||||
version: 473025a
|
||||
url: https://github.com/ROCm/rocm-systems
|
||||
torch:
|
||||
version: 73adac
|
||||
url: https://github.com/pytorch/pytorch
|
||||
torchvision:
|
||||
version: f5c6c2e
|
||||
url: https://github.com/pytorch/vision
|
||||
triton:
|
||||
version: 7416ffc
|
||||
url: https://github.com/triton-lang/triton
|
||||
accelerate:
|
||||
version: 34c1779
|
||||
url: https://github.com/huggingface/accelerate
|
||||
aiter:
|
||||
version: de14bec
|
||||
url: https://github.com/ROCm/aiter
|
||||
diffusers:
|
||||
version: 40528e9
|
||||
url: https://github.com/huggingface/diffusers
|
||||
xfuser:
|
||||
version: ccba9d5
|
||||
url: https://github.com/xdit-project/xDiT
|
||||
yunchang:
|
||||
version: 2c9b712
|
||||
url: https://github.com/feifeibear/long-context-attention
|
||||
supported_models:
|
||||
- group: Hunyuan Video
|
||||
js_tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
js_tag: hunyuan_tag
|
||||
- group: Wan-AI
|
||||
js_tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
js_tag: wan_21_tag
|
||||
- model: Wan2.2
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
js_tag: wan_22_tag
|
||||
- group: FLUX
|
||||
js_tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
js_tag: flux_1_tag
|
||||
- group: Stable Diffusion
|
||||
js_tag: stablediffusion
|
||||
models:
|
||||
- model: stable-diffusion-3.5-large
|
||||
model_repo: stabilityai/stable-diffusion-3.5-large
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-3.5-large
|
||||
github: https://github.com/Stability-AI/sd3.5
|
||||
mad_tag: pyt_xdit_sd_3_5
|
||||
js_tag: stable_diffusion_3_5_large_tag
|
||||
@@ -1,13 +1,13 @@
dockers:
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5
- pull_tag: rocm/vllm:rocm7.0.0_vllm_0.11.2_20251210
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.2_20251210/images/sha256-e7f02dd2ce3824959658bc0391296f6158638e3ebce164f6c019c4eca8150ec7
components:
ROCm: 7.0.0
vLLM: 0.10.2 (0.11.0rc2.dev160+g790d22168.rocm700)
vLLM: 0.11.2 (0.11.2.dev673+g839868462.rocm700)
PyTorch: 2.9.0a0+git1c57644
hipBLASLt: 1.0.0
dockerfile:
commit: 790d22168820507f3105fef29596549378cfe399
commit: 8398684622109c806a35d660647060b0b9910663
model_groups:
- group: Meta Llama
tag: llama

@@ -0,0 +1,105 @@
|
||||
docker:
|
||||
pull_tag: rocm/pytorch-xdit:v25.13
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef
|
||||
ROCm: 7.11.0
|
||||
whats_new:
|
||||
- "Flux.1 Kontext support"
|
||||
- "Flux.2 Dev support"
|
||||
- "Flux FP8 GEMM support"
|
||||
- "Hybrid FP8 attention support for Wan models"
|
||||
components:
|
||||
TheRock:
|
||||
version: 1728a81
|
||||
url: https://github.com/ROCm/TheRock
|
||||
rccl:
|
||||
version: d23d18f
|
||||
url: https://github.com/ROCm/rccl
|
||||
composable_kernel:
|
||||
version: ab0101c
|
||||
url: https://github.com/ROCm/composable_kernel
|
||||
rocm-libraries:
|
||||
version: a2f7c35
|
||||
url: https://github.com/ROCm/rocm-libraries
|
||||
rocm-systems:
|
||||
version: 659737c
|
||||
url: https://github.com/ROCm/rocm-systems
|
||||
torch:
|
||||
version: 91be249
|
||||
url: https://github.com/ROCm/pytorch
|
||||
torchvision:
|
||||
version: b919bd0
|
||||
url: https://github.com/pytorch/vision
|
||||
triton:
|
||||
version: a272dfa
|
||||
url: https://github.com/ROCm/triton
|
||||
accelerate:
|
||||
version: b521400f
|
||||
url: https://github.com/huggingface/accelerate
|
||||
aiter:
|
||||
version: de14bec0
|
||||
url: https://github.com/ROCm/aiter
|
||||
diffusers:
|
||||
version: a1f36ee3e
|
||||
url: https://github.com/huggingface/diffusers
|
||||
xfuser:
|
||||
version: adf2681
|
||||
url: https://github.com/xdit-project/xDiT
|
||||
yunchang:
|
||||
version: 2c9b712
|
||||
url: https://github.com/feifeibear/long-context-attention
|
||||
supported_models:
|
||||
- group: Hunyuan Video
|
||||
js_tag: hunyuan
|
||||
models:
|
||||
- model: Hunyuan Video
|
||||
model_repo: tencent/HunyuanVideo
|
||||
revision: refs/pr/18
|
||||
url: https://huggingface.co/tencent/HunyuanVideo
|
||||
github: https://github.com/Tencent-Hunyuan/HunyuanVideo
|
||||
mad_tag: pyt_xdit_hunyuanvideo
|
||||
js_tag: hunyuan_tag
|
||||
- group: Wan-AI
|
||||
js_tag: wan
|
||||
models:
|
||||
- model: Wan2.1
|
||||
model_repo: Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
|
||||
url: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
|
||||
github: https://github.com/Wan-Video/Wan2.1
|
||||
mad_tag: pyt_xdit_wan_2_1
|
||||
js_tag: wan_21_tag
|
||||
- model: Wan2.2
|
||||
model_repo: Wan-AI/Wan2.2-I2V-A14B-Diffusers
|
||||
url: https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers
|
||||
github: https://github.com/Wan-Video/Wan2.2
|
||||
mad_tag: pyt_xdit_wan_2_2
|
||||
js_tag: wan_22_tag
|
||||
- group: FLUX
|
||||
js_tag: flux
|
||||
models:
|
||||
- model: FLUX.1
|
||||
model_repo: black-forest-labs/FLUX.1-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux
|
||||
js_tag: flux_1_tag
|
||||
- model: FLUX.1 Kontext
|
||||
model_repo: black-forest-labs/FLUX.1-Kontext-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev
|
||||
github: https://github.com/black-forest-labs/flux
|
||||
mad_tag: pyt_xdit_flux_kontext
|
||||
js_tag: flux_1_kontext_tag
|
||||
- model: FLUX.2
|
||||
model_repo: black-forest-labs/FLUX.2-dev
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.2-dev
|
||||
github: https://github.com/black-forest-labs/flux2
|
||||
mad_tag: pyt_xdit_flux_2
|
||||
js_tag: flux_2_tag
|
||||
- group: StableDiffusion
|
||||
js_tag: stablediffusion
|
||||
models:
|
||||
- model: stable-diffusion-3.5-large
|
||||
model_repo: stabilityai/stable-diffusion-3.5-large
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-3.5-large
|
||||
github: https://github.com/Stability-AI/sd3.5
|
||||
mad_tag: pyt_xdit_sd_3_5
|
||||
js_tag: stable_diffusion_3_5_large_tag
|
||||
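These listings are plain YAML data consumed by the docs build. A small illustrative sketch of reading one of them with PyYAML and flattening the groups into rows follows; the file name is hypothetical, and both layouts seen above (model_groups and supported_models) are handled:

# Sketch only: flatten a model listing like the ones above into simple rows.
# The path is hypothetical; some files nest everything under a single top-level key.
import yaml

with open("pytorch-xdit.yaml") as f:
    data = yaml.safe_load(f)

if isinstance(data, dict) and len(data) == 1:
    data = next(iter(data.values()))  # unwrap e.g. a top-level xdit_diffusion_inference key

groups = data.get("model_groups") or data.get("supported_models") or []
for group in groups:
    for model in group.get("models", []):
        if isinstance(model, dict):
            print(group["group"], "-", model.get("model"), "-", model.get("mad_tag", ""))
        else:
            # supported_models entries can be bare model names
            print(group["group"], "-", model)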
@@ -1,12 +1,12 @@
dockers:
- pull_tag: rocm/jax-training:maxtext-v25.9
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
- pull_tag: rocm/jax-training:maxtext-v25.11
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.11/images/sha256-18e4d8f0b8ce7a7422c58046940dd5f32249960449fca09a562b65fb8eb1562a
components:
ROCm: 7.0.0
JAX: 0.6.2
Python: 3.10.18
Transformer Engine: 2.2.0.dev0+c91bac54
hipBLASLt: 1.x.x
ROCm: 7.1.0
JAX: 0.7.1
Python: 3.12
Transformer Engine: 2.4.0.dev0+281042de
hipBLASLt: 1.2.x
model_groups:
- group: Meta Llama
tag: llama

@@ -1,21 +1,17 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/megatron-lm:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: aab4234
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/megatron-lm:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
Triton: 3.4.0
|
||||
RCCL: 2.27.7
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
dockers:
|
||||
- pull_tag: rocm/jax-training:maxtext-v25.9.1
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.9.1/images/sha256-60946cfbd470f6ee361fc9da740233a4fb2e892727f01719145b1f7627a1cff6
|
||||
components:
|
||||
ROCm: 7.0.0
|
||||
JAX: 0.6.2
|
||||
Python: 3.10.18
|
||||
Transformer Engine: 2.2.0.dev0+c91bac54
|
||||
hipBLASLt: 1.x.x
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 2 7B
|
||||
mad_tag: jax_maxtext_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
precision: bf16
|
||||
multinode_training_script: llama2_7b_multinode.sh
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- model: Llama 2 70B
|
||||
mad_tag: jax_maxtext_train_llama-2-70b
|
||||
model_repo: Llama-2-70B
|
||||
precision: bf16
|
||||
multinode_training_script: llama2_70b_multinode.sh
|
||||
doc_options: ["single-node", "multi-node"]
|
||||
- model: Llama 3 8B (multi-node)
|
||||
mad_tag: jax_maxtext_train_llama-3-8b
|
||||
multinode_training_script: llama3_8b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3 70B (multi-node)
|
||||
mad_tag: jax_maxtext_train_llama-3-70b
|
||||
multinode_training_script: llama3_70b_multinode.sh
|
||||
doc_options: ["multi-node"]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: jax_maxtext_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V2-Lite (16B)
|
||||
mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
|
||||
model_repo: DeepSeek-V2-lite
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: jax_maxtext_train_mixtral-8x7b
|
||||
model_repo: Mixtral-8x7B
|
||||
precision: bf16
|
||||
doc_options: ["single-node"]
|
||||
@@ -0,0 +1,49 @@
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
Triton: 3.4.0
|
||||
RCCL: 2.27.7
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-7b
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-70b
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-7b
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-72b
|
||||
@@ -0,0 +1,53 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/megatron-lm:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: aab4234
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/megatron-lm:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-7b
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_megatron_lm_train_llama-2-70b
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-7b
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_megatron_lm_train_qwen2.5-72b
|
||||
@@ -0,0 +1,58 @@
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
Triton: 3.4.0
|
||||
RCCL: 2.27.7
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||
config_name: llama3.3_70B-pretrain.yaml
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||
config_name: llama3.1_70B-pretrain.yaml
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||
config_name: llama3.1_8B-pretrain.yaml
|
||||
- model: Llama 2 7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
|
||||
config_name: llama2_7B-pretrain.yaml
|
||||
- model: Llama 2 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
|
||||
config_name: llama2_70B-pretrain.yaml
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
config_name: deepseek_v3-pretrain.yaml
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
config_name: deepseek_v2_lite-pretrain.yaml
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||
config_name: mixtral_8x7B_v0.1-pretrain.yaml
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
config_name: mixtral_8x22B_v0.1-pretrain.yaml
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
|
||||
config_name: primus_qwen2.5_7B-pretrain.yaml
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||
config_name: qwen2.5_72B-pretrain.yaml
|
||||
@@ -0,0 +1,65 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/primus:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/primus:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
|
||||
config_name: llama3.3_70B-pretrain.yaml
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||
config_name: llama3.1_70B-pretrain.yaml
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||
config_name: llama3.1_8B-pretrain.yaml
|
||||
- model: Llama 2 7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
|
||||
config_name: llama2_7B-pretrain.yaml
|
||||
- model: Llama 2 70B
|
||||
mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
|
||||
config_name: llama2_70B-pretrain.yaml
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek-V3 (proxy)
|
||||
mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
config_name: deepseek_v3-pretrain.yaml
|
||||
- model: DeepSeek-V2-Lite
|
||||
mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
config_name: deepseek_v2_lite-pretrain.yaml
|
||||
- group: Mistral AI
|
||||
tag: mistral
|
||||
models:
|
||||
- model: Mixtral 8x7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||
config_name: mixtral_8x7B_v0.1-pretrain.yaml
|
||||
- model: Mixtral 8x22B (proxy)
|
||||
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
config_name: mixtral_8x22B_v0.1-pretrain.yaml
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 2.5 7B
|
||||
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
|
||||
config_name: primus_qwen2.5_7B-pretrain.yaml
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||
config_name: qwen2.5_72B-pretrain.yaml
|
||||
@@ -0,0 +1,32 @@
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B
|
||||
precision: BF16
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek V2 16B
|
||||
mad_tag: primus_pyt_train_deepseek-v2
|
||||
model_repo: DeepSeek-V2
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-V2
|
||||
precision: BF16
|
||||
@@ -0,0 +1,39 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/primus:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/primus:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_train_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
config_file:
|
||||
bf16: "./llama3_8b_fsdp_bf16.toml"
|
||||
fp8: "./llama3_8b_fsdp_fp8.toml"
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_train_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B
|
||||
precision: BF16
|
||||
config_file:
|
||||
bf16: "./llama3_70b_fsdp_bf16.toml"
|
||||
fp8: "./llama3_70b_fsdp_fp8.toml"
|
||||
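The MI355X/MI350X and MI325X/MI300X entries above share a single components mapping through the YAML anchor &docker_components and alias *docker_components. A short sketch (trimmed to two fields for brevity) showing how a YAML parser resolves that reuse:

# Sketch only: the &docker_components anchor and *docker_components alias
# make both docker entries resolve to the same components mapping after parsing.
import yaml

doc = yaml.safe_load("""
dockers:
  MI355X and MI350X:
    pull_tag: rocm/primus:v25.9_gfx950
    components: &docker_components
      ROCm: 7.0.0
      Flash Attention: 2.8.3
  MI325X and MI300X:
    pull_tag: rocm/primus:v25.9_gfx942
    components: *docker_components
""")

a = doc["dockers"]["MI355X and MI350X"]["components"]
b = doc["dockers"]["MI325X and MI300X"]["components"]
print(a is b, b["ROCm"])  # True 7.0.0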
@@ -0,0 +1,197 @@
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 4 Scout 17B-16E
|
||||
mad_tag: pyt_train_llama-4-scout-17b-16e
|
||||
model_repo: Llama-4-17B_16E
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
|
||||
- model: Llama 3.2 1B
|
||||
mad_tag: pyt_train_llama-3.2-1b
|
||||
model_repo: Llama-3.2-1B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-1B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.2 3B
|
||||
mad_tag: pyt_train_llama-3.2-3b
|
||||
model_repo: Llama-3.2-3B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-3B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.2 Vision 11B
|
||||
mad_tag: pyt_train_llama-3.2-vision-11b
|
||||
model_repo: Llama-3.2-Vision-11B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw]
|
||||
- model: Llama 3.2 Vision 90B
|
||||
mad_tag: pyt_train_llama-3.2-vision-90b
|
||||
model_repo: Llama-3.2-Vision-90B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: BF16
|
||||
training_modes: [pretrain, finetune_fw, finetune_lora]
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_train_llama-3.1-405b
|
||||
model_repo: Llama-3.1-405B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B
|
||||
precision: BF16
|
||||
training_modes: [finetune_qlora]
|
||||
- model: Llama 3 8B
|
||||
mad_tag: pyt_train_llama-3-8b
|
||||
model_repo: Llama-3-8B
|
||||
url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3 70B
|
||||
mad_tag: pyt_train_llama-3-70b
|
||||
model_repo: Llama-3-70B
|
||||
url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
|
||||
- model: Llama 2 13B
|
||||
mad_tag: pyt_train_llama-2-13b
|
||||
model_repo: Llama-2-13B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_train_llama-2-70b
|
||||
model_repo: Llama-2-70B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora, finetune_qlora]
|
||||
- group: OpenAI
|
||||
tag: openai
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_train_gpt_oss_20b
|
||||
model_repo: GPT-OSS-20B
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_train_gpt_oss_120b
|
||||
model_repo: GPT-OSS-120B
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek V2 16B
|
||||
mad_tag: primus_pyt_train_deepseek-v2
|
||||
model_repo: DeepSeek-V2
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-V2
|
||||
precision: BF16
|
||||
training_modes: [pretrain]
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 3 8B
|
||||
mad_tag: pyt_train_qwen3-8b
|
||||
model_repo: Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Qwen 3 32B
|
||||
mad_tag: pyt_train_qwen3-32b
|
||||
model_repo: Qwen3-32B
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2.5 32B
|
||||
mad_tag: pyt_train_qwen2.5-32b
|
||||
model_repo: Qwen2.5-32B
|
||||
url: https://huggingface.co/Qwen/Qwen2.5-32B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_train_qwen2.5-72b
|
||||
model_repo: Qwen2.5-72B
|
||||
url: https://huggingface.co/Qwen/Qwen2.5-72B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2 1.5B
|
||||
mad_tag: pyt_train_qwen2-1.5b
|
||||
model_repo: Qwen2-1.5B
|
||||
url: https://huggingface.co/Qwen/Qwen2-1.5B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Qwen 2 7B
|
||||
mad_tag: pyt_train_qwen2-7b
|
||||
model_repo: Qwen2-7B
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- group: Stable Diffusion
|
||||
tag: sd
|
||||
models:
|
||||
- model: Stable Diffusion XL
|
||||
mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
|
||||
model_repo: SDXL
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
|
||||
precision: BF16
|
||||
training_modes: [posttrain]
|
||||
- group: Flux
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1-dev
|
||||
mad_tag: pyt_train_flux
|
||||
model_repo: Flux
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
precision: BF16
|
||||
training_modes: [posttrain]
|
||||
- group: NCF
|
||||
tag: ncf
|
||||
models:
|
||||
- model: NCF
|
||||
mad_tag: pyt_ncf_training
|
||||
model_repo:
|
||||
url: https://github.com/ROCm/FluxBenchmark
|
||||
precision: FP32
|
||||
- group: DLRM
|
||||
tag: dlrm
|
||||
models:
|
||||
- model: DLRM v2
|
||||
mad_tag: pyt_train_dlrm
|
||||
model_repo: DLRM
|
||||
url: https://github.com/AMD-AGI/DLRMBenchmark
|
||||
training_modes: [pretrain]
|
||||
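Entries in this file carry a training_modes list that determines which workflows are documented for each model. A tiny sketch of filtering on it (the two rows are copied by hand from the list above, purely as sample data):

# Sketch only: pick out models that support LoRA fine-tuning.
models = [
    {"model": "Llama 3.1 8B",
     "training_modes": ["pretrain", "finetune_fw", "finetune_lora", "HF_pretrain"]},
    {"model": "Llama 3.1 405B",
     "training_modes": ["finetune_qlora"]},
]

lora_capable = [m["model"] for m in models if "finetune_lora" in m["training_modes"]]
print(lora_capable)  # ['Llama 3.1 8B']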
@@ -0,0 +1,186 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/pytorch-training:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: aab4234
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/pytorch-training:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 4 Scout 17B-16E
|
||||
mad_tag: pyt_train_llama-4-scout-17b-16e
|
||||
model_repo: Llama-4-17B_16E
|
||||
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.3 70B
|
||||
mad_tag: pyt_train_llama-3.3-70b
|
||||
model_repo: Llama-3.3-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
|
||||
- model: Llama 3.2 1B
|
||||
mad_tag: pyt_train_llama-3.2-1b
|
||||
model_repo: Llama-3.2-1B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-1B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.2 3B
|
||||
mad_tag: pyt_train_llama-3.2-3b
|
||||
model_repo: Llama-3.2-3B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-3B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3.2 Vision 11B
|
||||
mad_tag: pyt_train_llama-3.2-vision-11b
|
||||
model_repo: Llama-3.2-Vision-11B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw]
|
||||
- model: Llama 3.2 Vision 90B
|
||||
mad_tag: pyt_train_llama-3.2-vision-90b
|
||||
model_repo: Llama-3.2-Vision-90B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw]
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: pyt_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: pyt_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
||||
precision: BF16
|
||||
training_modes: [pretrain, finetune_fw, finetune_lora]
|
||||
- model: Llama 3.1 405B
|
||||
mad_tag: pyt_train_llama-3.1-405b
|
||||
model_repo: Llama-3.1-405B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-405B
|
||||
precision: BF16
|
||||
training_modes: [finetune_qlora]
|
||||
- model: Llama 3 8B
|
||||
mad_tag: pyt_train_llama-3-8b
|
||||
model_repo: Llama-3-8B
|
||||
url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 3 70B
|
||||
mad_tag: pyt_train_llama-3-70b
|
||||
model_repo: Llama-3-70B
|
||||
url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 2 7B
|
||||
mad_tag: pyt_train_llama-2-7b
|
||||
model_repo: Llama-2-7B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
|
||||
- model: Llama 2 13B
|
||||
mad_tag: pyt_train_llama-2-13b
|
||||
model_repo: Llama-2-13B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Llama 2 70B
|
||||
mad_tag: pyt_train_llama-2-70b
|
||||
model_repo: Llama-2-70B
|
||||
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora, finetune_qlora]
|
||||
- group: OpenAI
|
||||
tag: openai
|
||||
models:
|
||||
- model: GPT OSS 20B
|
||||
mad_tag: pyt_train_gpt_oss_20b
|
||||
model_repo: GPT-OSS-20B
|
||||
url: https://huggingface.co/openai/gpt-oss-20b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- model: GPT OSS 120B
|
||||
mad_tag: pyt_train_gpt_oss_120b
|
||||
model_repo: GPT-OSS-120B
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
- model: Qwen 3 8B
|
||||
mad_tag: pyt_train_qwen3-8b
|
||||
model_repo: Qwen3-8B
|
||||
url: https://huggingface.co/Qwen/Qwen3-8B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Qwen 3 32B
|
||||
mad_tag: pyt_train_qwen3-32b
|
||||
model_repo: Qwen3-32B
|
||||
url: https://huggingface.co/Qwen/Qwen3-32B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2.5 32B
|
||||
mad_tag: pyt_train_qwen2.5-32b
|
||||
model_repo: Qwen2.5-32B
|
||||
url: https://huggingface.co/Qwen/Qwen2.5-32B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2.5 72B
|
||||
mad_tag: pyt_train_qwen2.5-72b
|
||||
model_repo: Qwen2.5-72B
|
||||
url: https://huggingface.co/Qwen/Qwen2.5-72B
|
||||
precision: BF16
|
||||
training_modes: [finetune_lora]
|
||||
- model: Qwen 2 1.5B
|
||||
mad_tag: pyt_train_qwen2-1.5b
|
||||
model_repo: Qwen2-1.5B
|
||||
url: https://huggingface.co/Qwen/Qwen2-1.5B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- model: Qwen 2 7B
|
||||
mad_tag: pyt_train_qwen2-7b
|
||||
model_repo: Qwen2-7B
|
||||
url: https://huggingface.co/Qwen/Qwen2-7B
|
||||
precision: BF16
|
||||
training_modes: [finetune_fw, finetune_lora]
|
||||
- group: Stable Diffusion
|
||||
tag: sd
|
||||
models:
|
||||
- model: Stable Diffusion XL
|
||||
mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
|
||||
model_repo: SDXL
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
|
||||
precision: BF16
|
||||
training_modes: [posttrain-p]
|
||||
- group: Flux
|
||||
tag: flux
|
||||
models:
|
||||
- model: FLUX.1-dev
|
||||
mad_tag: pyt_train_flux
|
||||
model_repo: Flux
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
precision: BF16
|
||||
training_modes: [posttrain-p]
|
||||
- group: NCF
|
||||
tag: ncf
|
||||
models:
|
||||
- model: NCF
|
||||
mad_tag: pyt_ncf_training
|
||||
model_repo:
|
||||
url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
|
||||
precision: FP32
|
||||
@@ -1,22 +1,15 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/primus:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/primus:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.11
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
Triton: 3.4.0
|
||||
RCCL: 2.27.7
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
|
||||
@@ -1,39 +1,32 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/primus:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/primus:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.11
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
models:
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_train_llama-3.1-8b
|
||||
model_repo: meta-llama/Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
config_file:
|
||||
bf16: "./llama3_8b_fsdp_bf16.toml"
|
||||
fp8: "./llama3_8b_fsdp_fp8.toml"
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_train_llama-3.1-70b
|
||||
model_repo: meta-llama/Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B
|
||||
precision: BF16
|
||||
config_file:
|
||||
bf16: "./llama3_70b_fsdp_bf16.toml"
|
||||
fp8: "./llama3_70b_fsdp_fp8.toml"
|
||||
- model: Llama 3.1 8B
|
||||
mad_tag: primus_pyt_train_llama-3.1-8b
|
||||
model_repo: Llama-3.1-8B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-8B
|
||||
precision: BF16
|
||||
- model: Llama 3.1 70B
|
||||
mad_tag: primus_pyt_train_llama-3.1-70b
|
||||
model_repo: Llama-3.1-70B
|
||||
url: https://huggingface.co/meta-llama/Llama-3.1-70B
|
||||
precision: BF16
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek V3 16B
|
||||
mad_tag: primus_pyt_train_deepseek-v3-16b
|
||||
model_repo: DeepSeek-V3
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-V3
|
||||
precision: BF16
|
||||
|
||||
@@ -1,21 +1,15 @@
|
||||
dockers:
|
||||
MI355X and MI350X:
|
||||
pull_tag: rocm/pytorch-training:v25.9_gfx950
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
|
||||
components: &docker_components
|
||||
ROCm: 7.0.0
|
||||
Primus: aab4234
|
||||
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.2.0.dev0+54dd2bdc
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 911283acd1
|
||||
Triton: 3.4.0+rocm7.0.0.git56765e8c
|
||||
RCCL: 2.26.6
|
||||
MI325X and MI300X:
|
||||
pull_tag: rocm/pytorch-training:v25.9_gfx942
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
|
||||
components: *docker_components
|
||||
docker:
|
||||
pull_tag: rocm/primus:v25.10
|
||||
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197
|
||||
components:
|
||||
ROCm: 7.1.0
|
||||
Primus: 0.3.0
|
||||
Primus Turbo: 0.1.1
|
||||
PyTorch: 2.10.0.dev20251112+rocm7.1
|
||||
Python: "3.10"
|
||||
Transformer Engine: 2.4.0.dev0+32e2d1d4
|
||||
Flash Attention: 2.8.3
|
||||
hipBLASLt: 1.2.0-09ab7153e2
|
||||
model_groups:
|
||||
- group: Meta Llama
|
||||
tag: llama
|
||||
@@ -119,6 +113,15 @@ model_groups:
|
||||
url: https://huggingface.co/openai/gpt-oss-120b
|
||||
precision: BF16
|
||||
training_modes: [HF_finetune_lora]
|
||||
- group: DeepSeek
|
||||
tag: deepseek
|
||||
models:
|
||||
- model: DeepSeek V2 16B
|
||||
mad_tag: primus_pyt_train_deepseek-v2
|
||||
model_repo: DeepSeek-V2
|
||||
url: https://huggingface.co/deepseek-ai/DeepSeek-V2
|
||||
precision: BF16
|
||||
training_modes: [pretrain]
|
||||
- group: Qwen
|
||||
tag: qwen
|
||||
models:
|
||||
@@ -166,7 +169,7 @@ model_groups:
|
||||
model_repo: SDXL
|
||||
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
|
||||
precision: BF16
|
||||
training_modes: [posttrain-p]
|
||||
training_modes: [posttrain]
|
||||
- group: Flux
|
||||
tag: flux
|
||||
models:
|
||||
@@ -175,12 +178,20 @@ model_groups:
|
||||
model_repo: Flux
|
||||
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
|
||||
precision: BF16
|
||||
training_modes: [posttrain-p]
|
||||
training_modes: [posttrain]
|
||||
- group: NCF
|
||||
tag: ncf
|
||||
models:
|
||||
- model: NCF
|
||||
mad_tag: pyt_ncf_training
|
||||
model_repo:
|
||||
url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
|
||||
url: https://github.com/ROCm/FluxBenchmark
|
||||
precision: FP32
|
||||
- group: DLRM
|
||||
tag: dlrm
|
||||
models:
|
||||
- model: DLRM v2
|
||||
mad_tag: pyt_train_dlrm
|
||||
model_repo: DLRM
|
||||
url: https://github.com/AMD-AGI/DLRMBenchmark
|
||||
training_modes: [pretrain]
|
||||
|
||||
@@ -32,7 +32,7 @@ library_groups:

- name: "MIGraphX"
tag: "migraphx"
doc_link: "amdmigraphx:reference/cpp"
doc_link: "amdmigraphx:reference/MIGraphX-cpp"
data_types:
- type: "int8"
support: "⚠️"
@@ -290,7 +290,7 @@ library_groups:

- name: "Tensile"
tag: "tensile"
doc_link: "tensile:reference/precision-support"
doc_link: "tensile:src/reference/precision-support"
data_types:
- type: "int8"
support: "✅"

141
docs/extension/remote-content.py
Normal file
@@ -0,0 +1,141 @@
|
||||
from docutils import nodes
|
||||
from docutils.parsers.rst import Directive
|
||||
from docutils.statemachine import ViewList
|
||||
from sphinx.util import logging
|
||||
from sphinx.util.nodes import nested_parse_with_titles
|
||||
import requests
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class BranchAwareRemoteContent(Directive):
|
||||
"""
|
||||
Directive that downloads and includes content from other repositories,
|
||||
matching the branch/tag of the current documentation build.
|
||||
|
||||
Usage:
|
||||
.. remote-content::
|
||||
:repo: owner/repository
|
||||
:path: path/to/file.rst
|
||||
:default_branch: docs/develop # Branch to use when not on a release
|
||||
:tag_prefix: Docs/ # Optional
|
||||
"""
|
||||
|
||||
required_arguments = 0
|
||||
optional_arguments = 0
|
||||
final_argument_whitespace = True
|
||||
has_content = False
|
||||
option_spec = {
|
||||
'repo': str,
|
||||
'path': str,
|
||||
'default_branch': str, # Branch to use when not on a release tag
|
||||
'start_line': int, # Include the file from a specific line
|
||||
'tag_prefix': str, # Prefix for release tags (e.g., 'Docs/')
|
||||
}
|
||||
|
||||
def get_current_version(self):
|
||||
"""Get current version/branch being built"""
|
||||
env = self.state.document.settings.env
|
||||
html_context = env.config.html_context
|
||||
|
||||
# Check if building from a tag
|
||||
if "official_branch" in html_context:
|
||||
if html_context["official_branch"] == 0:
|
||||
if "version" in html_context:
|
||||
# Remove any 'v' prefix
|
||||
version = html_context["version"]
|
||||
if re.match(r'^\d+\.\d+\.\d+$', version):
|
||||
return version
|
||||
|
||||
# Not a version tag, so we'll use the default branch
|
||||
return None
|
||||
|
||||
def get_target_ref(self):
|
||||
"""Get target reference for the remote repository"""
|
||||
current_version = self.get_current_version()
|
||||
|
||||
# If it's a version number, use tag prefix and version
|
||||
if current_version:
|
||||
tag_prefix = self.options.get('tag_prefix', '')
|
||||
return f'{tag_prefix}{current_version}'
|
||||
|
||||
# For any other case, use the specified default branch
|
||||
if 'default_branch' not in self.options:
|
||||
logger.warning('No default_branch specified and not building from a version tag')
|
||||
return None
|
||||
|
||||
return self.options['default_branch']
|
||||
|
||||
def construct_raw_url(self, repo, path, ref):
|
||||
"""Construct the raw.githubusercontent.com URL"""
|
||||
return f'https://raw.githubusercontent.com/{repo}/{ref}/{path}'
|
||||
|
||||
def fetch_and_parse_content(self, url, source_path):
|
||||
"""Fetch content and parse it as RST"""
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
content = response.text
|
||||
|
||||
start_line = self.options.get('start_line', 0)
|
||||
|
||||
# Create ViewList for parsing
|
||||
line_count = 0
|
||||
content_list = ViewList()
|
||||
for line_no, line in enumerate(content.splitlines()):
|
||||
if line_count >= start_line:
|
||||
content_list.append(line, source_path, line_no)
|
||||
line_count += 1
|
||||
|
||||
# Create a section node and parse content
|
||||
node = nodes.section()
|
||||
nested_parse_with_titles(self.state, content_list, node)
|
||||
|
||||
return node.children
|
||||
|
||||
def run(self):
|
||||
if 'repo' not in self.options or 'path' not in self.options:
|
||||
logger.warning('Both repo and path options are required')
|
||||
return []
|
||||
|
||||
target_ref = self.get_target_ref()
|
||||
if not target_ref:
|
||||
return []
|
||||
|
||||
raw_url = self.construct_raw_url(
|
||||
self.options['repo'],
|
||||
self.options['path'],
|
||||
target_ref
|
||||
)
|
||||
|
||||
try:
|
||||
logger.info(f'Attempting to fetch content from {raw_url}')
|
||||
return self.fetch_and_parse_content(raw_url, self.options['path'])
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.warning(f'Failed to fetch content from {raw_url}: {str(e)}')
|
||||
|
||||
# If we failed on a tag, try falling back to default_branch
|
||||
if re.match(r'^\d+\.\d+\.\d+$', target_ref) or target_ref.startswith('Docs/'):
|
||||
if 'default_branch' in self.options:
|
||||
try:
|
||||
fallback_ref = self.options['default_branch']
|
||||
logger.info(f'Attempting fallback to {fallback_ref}...')
|
||||
|
||||
fallback_url = self.construct_raw_url(
|
||||
self.options['repo'],
|
||||
self.options['path'],
|
||||
fallback_ref
|
||||
)
|
||||
|
||||
return self.fetch_and_parse_content(fallback_url, self.options['path'])
|
||||
except requests.exceptions.RequestException as e2:
|
||||
logger.warning(f'Fallback also failed: {str(e2)}')
|
||||
|
||||
return []
|
||||
|
||||
def setup(app):
|
||||
app.add_directive('remote-content', BranchAwareRemoteContent)
|
||||
|
||||
return {
|
||||
'parallel_read_safe': True,
|
||||
'parallel_write_safe': True,
|
||||
}
|
||||
@@ -84,6 +84,8 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
|
||||
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-wheels-package>`__
|
||||
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a>
|
||||
@@ -98,18 +100,6 @@ The table below summarizes information about ROCm-enabled deep learning framewor
|
||||
|
||||
<a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
|
||||
-
|
||||
- `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
|
||||
- `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__
|
||||
|
||||
- .. raw:: html
|
||||
|
||||
<a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
|
||||
|
||||
* - `Ray <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/ray-compatibility.html>`__
|
||||
- .. raw:: html
|
||||
|
||||
|
||||
@@ -130,7 +130,7 @@ After loading the model in this way, the model is fully ready to use the resourc
|
||||
torchtune for fine-tuning and inference
|
||||
=============================================
|
||||
|
||||
`torchtune <https://pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU
|
||||
`torchtune <https://meta-pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU
|
||||
model fine-tuning and inference with LLMs.
|
||||
|
||||
#. Install torchtune using pip.
|
||||
|
||||
136
docs/how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries.rst
Normal file → Executable file
@@ -24,94 +24,102 @@ performance.
|
||||
:alt: Attention module of a large language module utilizing tiling
|
||||
:align: center
|
||||
|
||||
Installation prerequisites
|
||||
----------------------------
|
||||
|
||||
Before installing Flash Attention 2, ensure the following are available:
|
||||
|
||||
* ROCm-enabled PyTorch
|
||||
* Triton
|
||||
|
||||
These can be installed by following the official
|
||||
`PyTorch installation guide <https://pytorch.org/get-started/locally/>`_. Alternatively, for a simpler setup, you can use a preconfigured
|
||||
:ref:`ROCm PyTorch Docker image <using-docker-with-pytorch-pre-installed>`, which already includes the required libraries.
|
||||
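A minimal installation sketch is shown below -- the wheel index URL and ROCm version are assumptions, so check the PyTorch installation guide for the command that matches your ROCm release. The ROCm PyTorch wheels typically pull in the matching Triton package as a dependency.

.. code-block:: shell

   # Assumed ROCm wheel index; verify the version against the PyTorch installation guide.
   pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2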
|
||||
Installing Flash Attention 2
|
||||
----------------------------
|
||||
|
||||
ROCm provides two different implementations of Flash Attention 2 modules. They can be deployed interchangeably:
|
||||
`Flash Attention <https://github.com/Dao-AILab/flash-attention>`_ supports two backend implementations on AMD GPUs.
|
||||
|
||||
* ROCm `Composable Kernel <https://github.com/ROCm/composable_kernel/tree/develop/example/01_gemm>`_
|
||||
(CK) Flash Attention 2
|
||||
* `Composable Kernel (CK) <https://github.com/ROCm/composable_kernel>`__ - the default backend
|
||||
* `OpenAI Triton <https://github.com/triton-lang/triton>`__ - an alternative backend
|
||||
|
||||
* `OpenAI Triton <https://triton-lang.org/main/index.html>`_ Flash Attention 2
|
||||
You can switch between these backends using the environment variable ``FLASH_ATTENTION_TRITON_AMD_ENABLE``:
|
||||
|
||||
.. tab-set::
|
||||
``FLASH_ATTENTION_TRITON_AMD_ENABLE="FALSE"``
|
||||
→ Use Composable Kernel (CK) backend (Flash Attention 2)
|
||||
|
||||
.. tab-item:: CK Flash Attention 2
|
||||
``FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"``
|
||||
→ Use OpenAI Triton backend (Flash Attention 2)
|
||||
|
||||
To install CK Flash Attention 2, use the following commands.
|
||||
To install Flash Attention 2, use the following commands:
|
||||
|
||||
.. code-block:: shell
|
||||
.. code-block:: shell
|
||||
|
||||
# Install from source
|
||||
git clone https://github.com/ROCm/flash-attention.git
|
||||
cd flash-attention/
|
||||
GPU_ARCHS=gfx942 python setup.py install #MI300 Series
|
||||
git clone https://github.com/Dao-AILab/flash-attention.git
|
||||
cd flash-attention/
|
||||
pip install ninja
|
||||
|
||||
Hugging Face Transformers can deploy the CK Flash Attention 2 module by passing the argument
``attn_implementation="flash_attention_2"`` to the ``from_pretrained`` method.
|
||||
# To install the CK backend flash attention
|
||||
python setup.py install
|
||||
|
||||
.. code-block:: python
|
||||
# To install the Triton backend flash attention
|
||||
FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" python setup.py install
|
||||
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
model_name = "NousResearch/Meta-Llama-3-8B"
|
||||
# To install both CK and Triton backend flash attention
|
||||
FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE && FLASH_ATTENTION_SKIP_CK_BUILD=FALSE python setup.py install
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype=torch.float16, use_fast=False)
|
||||
inputs = tokenizer('Today is', return_tensors='pt').to(device)
|
||||
For detailed installation instructions, see `Flash Attention <https://github.com/Dao-AILab/flash-attention>`_.
|
||||
|
||||
model_eager = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, attn_implementation="eager").cuda(device)
|
||||
model_ckFAv2 = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, attn_implementation="flash_attention_2").cuda(device)
|
||||
Benchmarking Flash Attention 2
|
||||
------------------------------
|
||||
|
||||
print("eager GQA: ", tokenizer.decode(model_eager.generate(**inputs, max_new_tokens=10)[0], skip_special_tokens=True))
|
||||
print("ckFAv2 GQA: ", tokenizer.decode(model_ckFAv2.generate(**inputs, max_new_tokens=10)[0], skip_special_tokens=True))
|
||||
Benchmark scripts to evaluate the performance of Flash Attention 2 are stored in the ``flash-attention/benchmarks/`` directory.
|
||||
|
||||
# eager GQA: Today is the day of the Lord, and we are the
|
||||
# ckFAv2 GQA: Today is the day of the Lord, and we are the
|
||||
To benchmark the CK backend
|
||||
|
||||
.. tab-item:: Triton Flash Attention 2
|
||||
.. code-block:: shell
|
||||
|
||||
The Triton Flash Attention 2 module is implemented in Python and uses OpenAI’s JIT compiler. This module has been
|
||||
upstreamed into the vLLM serving toolkit, discussed in :doc:`llm-inference-frameworks`.
|
||||
cd flash-attention/benchmarks
|
||||
pip install transformers einops ninja
|
||||
|
||||
1. To install Triton Flash Attention 2 and run the benchmark, use the following commands.
|
||||
python3 benchmark_flash_attention.py
|
||||
|
||||
.. code-block:: shell
|
||||
To benchmark the Triton backend
|
||||
|
||||
# Install from the source
|
||||
pip uninstall pytorch-triton-rocm triton -y
|
||||
git clone https://github.com/ROCm/triton.git
|
||||
cd triton/python
|
||||
GPU_ARCHS=gfx942 python setup.py install #MI300 series
|
||||
pip install matplotlib pandas
|
||||
.. code-block:: shell
|
||||
|
||||
2. To test, run the Triton Flash Attention 2 performance benchmark.
|
||||
FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" python3 benchmark_flash_attention.py
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Test the triton FA v2 kernel
|
||||
python https://github.com/ROCm/triton/blob/triton-mlir/python/perf-kernels/flash-attention.py
|
||||
# Example results
|
||||
fused-attention-fwd-d128:
|
||||
BATCH HQ HK N_CTX_Q N_CTX_K TFLOPS
|
||||
0 16.0 16.0 16.0 1024.0 1024.0 287.528411
|
||||
1 8.0 16.0 16.0 2048.0 2048.0 287.490806
|
||||
2 4.0 16.0 16.0 4096.0 4096.0 345.966031
|
||||
3 2.0 16.0 16.0 8192.0 8192.0 361.369510
|
||||
4 1.0 16.0 16.0 16384.0 16384.0 356.873720
|
||||
5 2.0 48.0 48.0 1024.0 1024.0 216.916235
|
||||
6 2.0 48.0 48.0 2048.0 1024.0 271.027578
|
||||
7 2.0 48.0 48.0 4096.0 8192.0 337.367372
|
||||
8 2.0 48.0 48.0 8192.0 4096.0 363.481649
|
||||
9 2.0 48.0 48.0 16384.0 8192.0 375.013622
|
||||
10 8.0 16.0 16.0 1989.0 15344.0 321.791333
|
||||
11 4.0 16.0 16.0 4097.0 163.0 122.104888
|
||||
12 2.0 16.0 16.0 8122.0 2159.0 337.060283
|
||||
13 1.0 16.0 16.0 16281.0 7.0 5.234012
|
||||
14 2.0 48.0 48.0 1021.0 1020.0 214.657425
|
||||
15 2.0 48.0 48.0 2001.0 2048.0 314.429118
|
||||
16 2.0 48.0 48.0 3996.0 9639.0 330.411368
|
||||
17 2.0 48.0 48.0 8181.0 1021.0 324.614980
|
||||
Using Flash Attention 2
|
||||
-----------------------
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
model_name = "NousResearch/Llama-3.2-1B"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, dtype=torch.bfloat16, use_fast=False)
|
||||
inputs = tokenizer('Today is', return_tensors='pt').to(device)
|
||||
|
||||
model_eager = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16, attn_implementation="eager").cuda(device)
|
||||
model_ckFAv2 = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16, attn_implementation="flash_attention_2").cuda(device)
|
||||
model_eager.generation_config.pad_token_id = model_eager.generation_config.eos_token_id
|
||||
model_ckFAv2.generation_config.pad_token_id = model_ckFAv2.generation_config.eos_token_id
|
||||
|
||||
print("eager\n GQA: ", tokenizer.decode(model_eager.generate(**inputs, max_new_tokens=22)[0], skip_special_tokens=True, do_sample=False, num_beams=1))
|
||||
print("ckFAv2\n GQA: ", tokenizer.decode(model_ckFAv2.generate(**inputs, max_new_tokens=22)[0], skip_special_tokens=True, do_sample=False, num_beams=1))
|
||||
|
||||
The outputs from eager mode and FlashAttention-2 are identical, although their performance behavior differs.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
eager
|
||||
GQA: Today is the 10th anniversary of the 9/11 attacks. I remember that day like it was yesterday.
|
||||
ckFAv2
|
||||
GQA: Today is the 10th anniversary of the 9/11 attacks. I remember that day like it was yesterday.
|
||||
|
||||
xFormers
|
||||
========
|
||||
|
||||
1146
docs/how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
Normal file
File diff suppressed because it is too large
@@ -15,10 +15,9 @@ using PyTorch. It delves into specific workloads such as
|
||||
:ref:`model inference <mi300x-vllm-optimization>`, offering strategies to
|
||||
enhance efficiency.
|
||||
|
||||
The following topics highlight :ref:`auto-tunable configurations <mi300x-auto-tune>`
|
||||
that streamline optimization as well as advanced techniques like
|
||||
:ref:`Triton kernel optimization <mi300x-triton-kernel-performance-optimization>` for
|
||||
meticulous tuning.
|
||||
The following topics highlight :ref:`auto-tunable configurations <mi300x-auto-tune>` as
|
||||
well as :ref:`Triton kernel optimization <mi300x-triton-kernel-performance-optimization>`
|
||||
for meticulous tuning.
|
||||
|
||||
Workload tuning strategy
|
||||
========================
|
||||
@@ -86,27 +85,28 @@ Optimize model inference with vLLM
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
vLLM provides tools and techniques specifically designed for efficient model
|
||||
inference on AMD Instinct MI300X GPUs. See :ref:`fine-tuning-llms-vllm`
|
||||
for installation guidance. Optimizing performance with vLLM
|
||||
involves configuring tensor parallelism, leveraging advanced features, and
|
||||
ensuring efficient execution. Here’s how to optimize vLLM performance:
|
||||
inference on AMD Instinct GPUs. See the official `vLLM installation docs
|
||||
<https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html>`__ for
|
||||
installation guidance. Optimizing performance with vLLM involves configuring
|
||||
tensor parallelism, leveraging advanced features, and ensuring efficient
|
||||
execution.
|
||||
|
||||
* Tensor parallelism: Configure the
|
||||
:ref:`tensor-parallel-size parameter <mi300x-vllm-multiple-gpus>` to distribute
|
||||
tensor computations across multiple GPUs. Adjust parameters such as
|
||||
``batch-size``, ``input-len``, and ``output-len`` based on your workload.
|
||||
|
||||
* Configuration for vLLM: Set :ref:`parameters <mi300x-vllm-optimization>`
|
||||
according to workload requirements. Benchmark performance to understand
|
||||
characteristics and identify bottlenecks.
|
||||
* Configuration for vLLM: Set engine arguments according to workload
|
||||
requirements.
|
||||
|
||||
* Benchmarking and performance metrics: Measure latency and throughput to
|
||||
evaluate performance.
|
||||
|
||||
.. seealso::
|
||||
|
||||
See :doc:`vllm-optimization` to learn more about vLLM performance
|
||||
optimization techniques.
|
||||
|
||||
.. _mi300x-auto-tune:
|
||||
|
||||
Auto-tunable configurations
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Auto-tunable configurations can significantly streamline performance
|
||||
optimization by automatically adjusting parameters based on workload
|
||||
characteristics. For example:
|
||||
@@ -120,8 +120,7 @@ characteristics. For example:
|
||||
your specific hardware.
|
||||
|
||||
* Triton: Use :ref:`Triton’s auto-tuning features <mi300x-autotunable-kernel-config>`
|
||||
to explore various kernel configurations and automatically select the
|
||||
best-performing ones.
|
||||
to explore various kernel configurations and select the best-performing ones.
|
||||
|
||||
Manual tuning
|
||||
^^^^^^^^^^^^^
|
||||
@@ -328,380 +327,21 @@ hardware counters are also included.
|
||||
|
||||
ROCm Systems Profiler timeline trace example.
|
||||
|
||||
.. _mi300x-vllm-optimization:
|
||||
|
||||
vLLM performance optimization
|
||||
=============================
|
||||
|
||||
vLLM is a high-throughput and memory efficient inference and serving engine for large language models that has gained traction in the AI community for
|
||||
its performance and ease of use. See :ref:`fine-tuning-llms-vllm` for a primer on vLLM with ROCm.
|
||||
|
||||
Performance environment variables
|
||||
---------------------------------
|
||||
|
||||
The following performance tips are not *specific* to vLLM -- they are general
|
||||
but relevant in this context. You can tune the following settings to
achieve optimal request latency and throughput.
|
||||
|
||||
* As described in `Environment variables (MI300X)
|
||||
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#environment-variables>`_,
|
||||
the environment variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM
|
||||
performance. Set it to ``export HIP_FORCE_DEV_KERNARG=1``.
|
||||
|
||||
* Set the :ref:`RCCL environment variable <mi300x-rccl>` ``NCCL_MIN_NCHANNELS``
|
||||
to ``112`` to increase the number of channels on MI300X to potentially improve
|
||||
performance.
|
||||
|
||||
* Set the environment variable ``TORCH_BLAS_PREFER_HIPBLASLT=1`` to use hipBLASLt to improve performance. All three variables are combined in the example after this list.
|
||||
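For example, all three variables can be exported together before launching your vLLM workload:

.. code-block:: shell

   export HIP_FORCE_DEV_KERNARG=1
   export NCCL_MIN_NCHANNELS=112
   export TORCH_BLAS_PREFER_HIPBLASLT=1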
|
||||
Auto-tuning using PyTorch TunableOp
|
||||
------------------------------------
|
||||
|
||||
Since vLLM is based on the PyTorch framework, PyTorch TunableOp can be used for auto-tuning.
|
||||
You can run auto-tuning with TunableOp in two simple steps without modifying your code:
|
||||
|
||||
* Enable TunableOp and tuning. Optionally, enable verbose mode:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_VERBOSE=1 your_vllm_script.sh
|
||||
|
||||
* Enable TunableOp but disable tuning, then measure.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 your_vllm_script.sh
|
||||
|
||||
Learn more about TunableOp in the :ref:`PyTorch TunableOp <mi300x-tunableop>` section.
|
||||
|
||||
Performance tuning based on vLLM engine configurations
|
||||
-------------------------------------------------------
|
||||
|
||||
The following subsections describe vLLM-specific configurations for performance tuning.
|
||||
You can tune the following vLLM parameters to achieve optimal performance.
|
||||
|
||||
* ``tensor_parallel_size``
|
||||
|
||||
* ``gpu_memory_utilization``
|
||||
|
||||
* ``dtype``
|
||||
|
||||
* ``enforce_eager``
|
||||
|
||||
* ``kv_cache_dtype``
|
||||
|
||||
* ``input_len``
|
||||
|
||||
* ``output_len``
|
||||
|
||||
* ``max_num_seqs``
|
||||
|
||||
* ``num_scheduler_steps``
|
||||
|
||||
* ``max_model_len``
|
||||
|
||||
* ``enable_chunked_prefill``
|
||||
|
||||
* ``distributed_executor_backend``
|
||||
|
||||
* ``max_seq_len_to_capture``
|
||||
|
||||
Refer to `vLLM documentation <https://docs.vllm.ai/en/latest/models/performance.html>`_
|
||||
for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
|
||||
usage with ROCm.
|
||||
|
||||
ROCm provides a prebuilt optimized Docker image for validating the performance
|
||||
of LLM inference with vLLM on MI300X Series GPUs. The Docker image includes
|
||||
ROCm, vLLM, and PyTorch. For more information, see
|
||||
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
|
||||
|
||||
.. _mi300x-vllm-throughput-measurement:
|
||||
|
||||
Evaluating performance by throughput measurement
|
||||
-------------------------------------------------
|
||||
|
||||
This tuning guide evaluates the performance of LLM inference workloads by measuring throughput in tokens per second (TPS). Throughput can be assessed using both real-world and synthetic data, depending on your evaluation goals.
|
||||
|
||||
Refer to the benchmarking script located at ``benchmarks/benchmark_throughput.py`` in the `vLLM repository <https://github.com/ROCm/vllm/blob/main/benchmarks/benchmark_throughput.py>`_.
|
||||
Use this script to measure throughput effectively.
|
||||
|
||||
* For realistic performance evaluation, you can use datasets like Hugging Face's
|
||||
``ShareGPT_V3_unfiltered_cleaned_split.json``. This dataset includes real-world conversational
|
||||
data, making it a good representation of typical use cases for language models. Download it using
|
||||
the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
* For standardized benchmarking, you can set fixed input and output token
|
||||
lengths. Synthetic prompts provide consistent benchmarking runs, making it
|
||||
easier to compare performance across different models or configurations.
|
||||
Additionally, a controlled environment simplifies analysis. A sample fixed-length command is shown after this list.
|
||||
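For example, a fixed-length synthetic run might look like the following sketch, where the model path and token lengths are placeholders:

.. code-block:: shell

   python3 /vllm-workspace/benchmarks/benchmark_throughput.py \
       --model /path/to/model \
       --input-len 1024 --output-len 128 --num-prompts 500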
|
||||
By balancing real-world data and synthetic data approaches, you can get a well-rounded understanding of model performance in varied scenarios.
|
||||
|
||||
.. _mi300x-vllm-single-node:
|
||||
|
||||
Maximizing vLLM instances on a single node
|
||||
------------------------------------------
|
||||
|
||||
The general guideline is to maximize per-node throughput by running as many vLLM instances as possible.
|
||||
However, running too many instances might lead to insufficient memory for the KV-cache, which can affect performance.
|
||||
|
||||
The Instinct MI300X GPU is equipped with 192 GB of HBM3 memory.
|
||||
For models that fit in one GPU -- to maximize the accumulated throughput -- you can run as many as eight vLLM instances
|
||||
simultaneously on one MI300X node (with eight GPUs). To do so, use the GPU isolation environment
|
||||
variable ``CUDA_VISIBLE_DEVICES``.
|
||||
|
||||
For example, this script runs eight instances of vLLM for throughput benchmarking at the same time
|
||||
with a model that can fit in one GPU:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
for i in $(seq 0 7);
|
||||
do
|
||||
CUDA_VISIBLE_DEVICES="$i" python3 /app/vllm/benchmarks/benchmark_throughput.py -tp 1 --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
|
||||
done
|
||||
|
||||
The total throughput achieved by running ``N`` instances of vLLM is generally much higher than running a
|
||||
single vLLM instance across ``N`` GPUs simultaneously (that is, configuring ``tensor_parallel_size`` as N or
|
||||
using the ``-tp`` N option, where ``1 < N ≤ 8``).
|
||||
|
||||
vLLM on MI300X GPUs can run a variety of model weights, including Llama 2 (7b, 13b, 70b), Llama 3 (8b, 70b), Qwen2 (7b, 72b), Mixtral-8x7b, Mixtral-8x22b, and so on.
|
||||
Notable configurations include the Llama2-70b and Llama3-70b models on a single MI300X GPU, and the Llama3.1 405b model fits on a single node with 8 MI300X GPUs.
|
||||
|
||||
.. _mi300x-vllm-gpu-memory-utilization:
|
||||
|
||||
Configure the gpu_memory_utilization parameter
|
||||
----------------------------------------------
|
||||
|
||||
There are two ways to increase throughput by configuring the ``gpu-memory-utilization`` parameter.
|
||||
|
||||
1. Increase ``gpu-memory-utilization`` to improve the throughput for a single instance as long as
|
||||
it does not incur HIP or CUDA Out Of Memory. The default ``gpu-memory-utilization`` is 0.9.
|
||||
You can set it to ``>0.9`` and ``<1``.
|
||||
|
||||
For example, the following benchmarking command sets ``gpu-memory-utilization`` to 0.98 (98%).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
/vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.98 --input-len 1024 --output-len 128 --model /path/to/model
|
||||
|
||||
2. Decrease ``gpu-memory-utilization`` to maximize the number of vLLM instances on the same GPU.
|
||||
|
||||
Specify GPU memory utilization to run as many instances of vLLM as possible on a single
|
||||
GPU. However, too many instances can result in no memory for KV-cache. For small models, run
|
||||
multiple instances of vLLM on the same GPU by specifying a smaller ``gpu-memory-utilization`` -- as
|
||||
long as it would not cause HIP Out Of Memory.
|
||||
|
||||
For example, run two instances of the Llama3-8b model at the same time on a single GPU by specifying
|
||||
``--gpu-memory-utilization`` to 0.4 (40%) as follows (on GPU ``0``):
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4 \
    --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &

CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4 \
    --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
|
||||
|
||||
See :ref:`vllm-engine-args` for other performance suggestions.
|
||||
|
||||
.. _mi300x-vllm-multiple-gpus:
|
||||
|
||||
Run vLLM on multiple GPUs
|
||||
-------------------------
|
||||
|
||||
The two main reasons to use multiple GPUs are:
|
||||
|
||||
* The model is too big to run with vLLM on one GPU, as doing so results in HIP Out of Memory errors.
|
||||
|
||||
* To achieve better latency when using a single GPU is not desirable.
|
||||
|
||||
To run one vLLM instance on multiple GPUs, use the ``-tp`` or ``--tensor-parallel-size`` option to
|
||||
specify multiple GPUs. Optionally, use the ``CUDA_VISIBLE_DEVICES`` environment variable to specify
|
||||
the GPUs.
|
||||
|
||||
For example, you can use two GPUs to start an API server on port 8000:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python -m vllm.entrypoints.api_server --model /path/to/model \
    --dtype float16 -tp 2 --port 8000 &
|
||||
|
||||
To achieve both latency and throughput performance for serving, you can run multiple API servers on
|
||||
different GPUs by specifying different ports for each server and use ``CUDA_VISIBLE_DEVICES`` to
|
||||
specify the GPUs for each server, for example:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.api_server --model \
    /path/to/model --dtype float16 -tp 2 --port 8000 &

CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.api_server --model \
    /path/to/model --dtype float16 -tp 2 --port 8001 &
|
||||
|
||||
Choose an attention backend
|
||||
---------------------------
|
||||
|
||||
vLLM on ROCm supports two attention backends, each suitable for different use cases and performance
|
||||
requirements:
|
||||
|
||||
- **Triton Flash Attention** - For benchmarking, run vLLM scripts at
|
||||
least once as a warm-up step so Triton can perform auto-tuning before
|
||||
collecting benchmarking numbers. This is the default setting.
|
||||
|
||||
- **Composable Kernel (CK) Flash Attention** - To use CK Flash Attention, specify
|
||||
the environment variable as ``export VLLM_USE_TRITON_FLASH_ATTN=0``, as shown in the example after this list.
|
||||
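For example, to collect numbers with the CK backend, you can set the variable inline for a single benchmarking run (a sketch; the script path and model path are placeholders):

.. code-block:: shell

   VLLM_USE_TRITON_FLASH_ATTN=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py \
       --model /path/to/model --input-len 1024 --output-len 128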
|
||||
|
||||
Refer to :ref:`Model acceleration libraries <acceleration-flash-attention>`
|
||||
to learn more about Flash Attention with Triton or CK backends.
|
||||
|
||||
.. _vllm-engine-args:
|
||||
|
||||
vLLM engine arguments
|
||||
---------------------
|
||||
|
||||
The following are configuration suggestions to potentially improve performance with vLLM. See
|
||||
`vLLM's engine arguments documentation <https://docs.vllm.ai/en/latest/serving/engine_args.html>`_
|
||||
for a full list of configurable engine arguments.
|
||||
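As a rough sketch, several of the arguments discussed in the following subsections can be combined in a single benchmarking run. The model path and values shown here are placeholders -- tune them for your workload:

.. code-block:: shell

   python3 /vllm-workspace/benchmarks/benchmark_throughput.py \
       --model /path/to/model \
       --dtype float16 \
       --max-num-seqs 512 \
       --num-scheduler-steps 10 \
       --max-seq-len-to-capture 16384 \
       --input-len 1024 --output-len 128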
|
||||
Configure the max-num-seqs parameter
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Increase the ``max-num-seqs`` parameter from the default ``256`` to ``512`` (``--max-num-seqs
|
||||
512``). This increases the maximum number of sequences per iteration and can improve throughput.
|
||||
|
||||
Use the float16 dtype
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The default data type (``dtype``) is specified in the model’s configuration file. For instance, some models use ``torch.bfloat16`` as their default ``dtype``.
|
||||
Use float16 (``--dtype float16``) for better performance.
|
||||
|
||||
Multi-step scheduling
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Setting ``num-scheduler-steps`` for multi-step scheduling can increase performance. Set it between 10 and 15 (``--num-scheduler-steps 10``).
|
||||
|
||||
Distributed executor backend
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
vLLM supports two distributed executor backends: ``ray`` and ``mp``. When using the `ROCm fork of vLLM <https://github.com/ROCm/vllm>`__, the ``mp``
backend (``--distributed_executor_backend mp``) is recommended.
|
||||
|
||||
Graph mode max-seq-len-to-capture
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
``max-seq-len-to-capture`` is the maximum sequence length covered by CUDA graphs. In the default mode (where ``enforce_eager`` is ``False``), when a sequence has a context length
larger than this, the vLLM engine falls back to eager mode. The default is 8192.
|
||||
|
||||
When working with models that support long context lengths, set the parameter ``--max-seq-len-to-capture`` to 16384.
|
||||
See this `vLLM blog <https://blog.vllm.ai/2024/10/23/vllm-serving-amd.html>`__ for details.
|
||||
|
||||
An example of a long-context-length model is Qwen2-7B.
|
||||
|
||||
Whether to enable chunked prefill
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Another vLLM performance tip is to enable chunked prefill to improve
|
||||
throughput. Chunked prefill allows large prefills to be chunked into
|
||||
smaller chunks and batched together with decode requests.
|
||||
|
||||
You can enable the feature by specifying ``--enable-chunked-prefill`` in the
|
||||
command line or setting ``enable_chunked_prefill=True`` in the LLM
|
||||
constructor.
|
||||
|
||||
As stated in `vLLM's documentation, <https://docs.vllm.ai/en/latest/models/performance.html#chunked-prefill>`__,
|
||||
you can tune the performance by changing ``max_num_batched_tokens``. By
|
||||
default, it is set to 512 and optimized for ITL (inter-token latency).
|
||||
Smaller ``max_num_batched_tokens`` achieves better ITL because there are
|
||||
fewer prefills interrupting decodes.
|
||||
Higher ``max_num_batched_tokens`` achieves better TTFT (time to the first
|
||||
token) as you can put more prefill to the batch.
|
||||
|
||||
You might experience noticeable throughput improvements when
|
||||
benchmarking on a single GPU or 8 GPUs using the vLLM throughput
|
||||
benchmarking script along with the ShareGPT dataset as input.
|
||||
|
||||
In the case of fixed ``input-len``/``output-len``, for some configurations,
|
||||
enabling chunked prefill increases the throughput. For some other
|
||||
configurations, the throughput may be worse and elicit a need to tune
|
||||
parameter ``max_num_batched_tokens`` (for example, increasing ``max_num_batched_tokens`` value to 4096 or larger).
|
||||
|
||||
.. note::
|
||||
|
||||
Chunked prefill is no longer recommended. See the vLLM blog: `Serving LLMs on AMD MI300X: Best Practices <https://blog.vllm.ai/2024/10/23/vllm-serving-amd.html>`_ (October 2024).
|
||||
|
||||
Quantization support
|
||||
---------------------
|
||||
|
||||
Quantization reduces the precision of the model’s weights and activations, which significantly decreases the memory footprint.
|
||||
``fp8(w8a8)`` and ``AWQ`` quantization are supported for ROCm.
|
||||
|
||||
FP8 quantization
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
The `ROCm fork of vLLM <https://github.com/ROCm/vllm>`__ supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on the Instinct MI300X.
|
||||
Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy.
|
||||
|
||||
AMD publishes Quark Quantized OCP FP8 models on Hugging Face. For example:
|
||||
|
||||
* `Llama-3.1-8B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV>`__
|
||||
* `Llama-3.1-70B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`__
|
||||
* `Llama-3.1-405B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`__
|
||||
* `Mixtral-8x7B-Instruct-v0.1-FP8-KV <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`__
|
||||
* `Mixtral-8x22B-Instruct-v0.1-FP8-KV <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`__
|
||||
|
||||
To enable vLLM benchmarking to run on fp8 quantized models, use the ``--quantization`` parameter with value ``fp8`` (``--quantization fp8``).
|
||||
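For example, a throughput run against one of the Quark-quantized FP8 models listed above might look like the following sketch (input and output lengths are placeholders):

.. code-block:: shell

   python3 /vllm-workspace/benchmarks/benchmark_throughput.py \
       --model amd/Llama-3.1-8B-Instruct-FP8-KV \
       --quantization fp8 \
       --input-len 512 --output-len 256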
|
||||
AWQ quantization
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
You can quantize your own models by installing AutoAWQ or picking one of the 400+ models on Hugging Face. Be aware
that AWQ support in vLLM is currently underoptimized.
|
||||
|
||||
To enable vLLM to run on ``awq`` quantized models, use the ``--quantization`` parameter with ``awq`` (``--quantization awq``).
|
||||
|
||||
You can find more specifics in the `vLLM AutoAWQ documentation <https://docs.vllm.ai/en/stable/quantization/auto_awq.html>`_.
|
||||
|
||||
fp8 kv-cached-dtype
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Using ``fp8 kv-cache dtype`` can improve performance as it reduces the size
|
||||
of ``kv-cache``. As a result, it reduces the cost required for reading and
|
||||
writing the ``kv-cache``.
|
||||
|
||||
To use this feature, specify ``--kv-cache-dtype`` as ``fp8``.
|
||||
|
||||
To specify the quantization scaling config, use the
|
||||
``--quantization-param-path`` parameter. If the parameter is not specified,
|
||||
the default scaling factor of ``1`` is used, which can lead to less accurate
|
||||
results. To generate the ``kv-cache`` scaling JSON file, see `FP8 KV
|
||||
Cache <https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_kv_cache/README.md>`__
|
||||
in the vLLM GitHub repository.
|
||||
|
||||
Two sample Llama scaling configuration files are in vLLM for ``llama2-70b`` and
|
||||
``llama2-7b``.
|
||||
|
||||
If building the vLLM using
|
||||
`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm>`_
|
||||
for ``llama2-70b`` scale config, find the file at
|
||||
``/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json`` at
|
||||
runtime.
|
||||
|
||||
Below is a sample command to run benchmarking with this feature enabled
|
||||
for the ``llama2-70b`` model:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python3 /vllm-workspace/benchmarks/benchmark_throughput.py --model \
|
||||
/path/to/llama2-70b-model --kv-cache-dtype "fp8" \
|
||||
--quantization-param-path \
|
||||
"/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json" \
|
||||
--input-len 512 --output-len 256 --num-prompts 500
|
||||
|
||||
vLLM is a high-throughput and memory efficient inference and serving engine for
|
||||
large language models that has gained traction in the AI community for its
|
||||
performance and ease of use. See :doc:`vllm-optimization`, where you'll learn
|
||||
how to:
|
||||
|
||||
* Enable AITER (AI Tensor Engine for ROCm) to speed up LLM workloads (see the example after this list).
|
||||
* Configure environment variables for optimal HIP, RCCL, and Quick Reduce performance.
|
||||
* Select the right attention backend for your workload (AITER MHA/MLA vs. Triton).
|
||||
* Choose parallelism strategies (tensor, pipeline, data, expert) for multi-GPU deployments.
|
||||
* Apply quantization (``FP8``/``FP4``) to reduce memory usage by 2-4× with minimal accuracy loss.
|
||||
* Tune engine arguments (batch size, memory utilization, graph modes) for your use case.
|
||||
* Benchmark and scale across single-node and multi-node configurations.
|
||||
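For instance, AITER is toggled with a single environment variable before launching vLLM (a sketch; the model path is a placeholder):

.. code-block:: shell

   export VLLM_ROCM_USE_AITER=1
   vllm serve /path/to/model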
|
||||
.. _mi300x-tunableop:
|
||||
|
||||
@@ -946,33 +586,33 @@ for details.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
HIP_FORCE_DEV_KERNARG=1 hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
|
||||
HIP_FORCE_DEV_KERNARG=1 hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
|
||||
--a_type f16_r --b_type f8_r --compute_type f32_f16_r \
|
||||
--initialization trig_float --cold_iters 100 --iters 1000 --rotating 256
|
||||
--initialization trig_float --cold_iters 100 --iters 1000 --rotating 256
|
||||
|
||||
* Example 2: Benchmark forward epilogues and backward epilogues. A combined command using these flags is shown after this list.
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";``
|
||||
* ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";``
|
||||
* ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";``
|
||||
* ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_DGELU": --activation_type gelu --gradient";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";``
|
||||
* ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";``
|
||||
* ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";``
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";``
|
||||
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADB: "--bias_vector --gradient --bias_source b";``
|
||||
* ``HIPBLASLT_EPILOGUE_BGRADB: "--bias_vector --gradient --bias_source b";``
|
||||
|
||||
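For example, benchmarking the ReLU-plus-bias epilogue combines the flags above with the base command from Example 1 (a sketch; the data types and iteration counts are illustrative):

.. code-block:: shell

   HIP_FORCE_DEV_KERNARG=1 hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
       --a_type f16_r --b_type f8_r --compute_type f32_f16_r \
       --activation_type relu --bias_vector \
       --initialization trig_float --cold_iters 100 --iters 1000 --rotating 256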
|
||||
hipBLASLt auto-tuning using hipblaslt-bench
|
||||
@@ -1031,26 +671,26 @@ The tuning tool is a two-step tool. It first runs the benchmark, then it creates
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
defaultBenchOptions = {"ProblemType": {
|
||||
"TransposeA": 0,
|
||||
"TransposeB": 0,
|
||||
"ComputeInputDataType": "s",
|
||||
"ComputeDataType": "s",
|
||||
"DataTypeC": "s",
|
||||
"DataTypeD": "s",
|
||||
"UseBias": False
|
||||
}, "TestConfig": {
|
||||
"ColdIter": 20,
|
||||
"Iter": 100,
|
||||
"AlgoMethod": "all",
|
||||
"RequestedSolutions": 2, # Only works in AlgoMethod heuristic
|
||||
"SolutionIndex": None, # Only works in AlgoMethod index
|
||||
"ApiMethod": "cpp",
|
||||
"RotatingBuffer": 0,
|
||||
}, "TuningParameters": {
|
||||
"SplitK": [0]
|
||||
}, "ProblemSizes": []}
|
||||
defaultCreateLogicOptions = {} # Currently unused
|
||||
defaultBenchOptions = {"ProblemType": {
|
||||
"TransposeA": 0,
|
||||
"TransposeB": 0,
|
||||
"ComputeInputDataType": "s",
|
||||
"ComputeDataType": "s",
|
||||
"DataTypeC": "s",
|
||||
"DataTypeD": "s",
|
||||
"UseBias": False
|
||||
}, "TestConfig": {
|
||||
"ColdIter": 20,
|
||||
"Iter": 100,
|
||||
"AlgoMethod": "all",
|
||||
"RequestedSolutions": 2, # Only works in AlgoMethod heuristic
|
||||
"SolutionIndex": None, # Only works in AlgoMethod index
|
||||
"ApiMethod": "cpp",
|
||||
"RotatingBuffer": 0,
|
||||
}, "TuningParameters": {
|
||||
"SplitK": [0]
|
||||
}, "ProblemSizes": []}
|
||||
defaultCreateLogicOptions = {} # Currently unused
|
||||
|
||||
* ``TestConfig``
|
||||
1. ``ColdIter``: This is the number of warm-up iterations before starting the kernel benchmark.
|
||||
@@ -1230,7 +870,7 @@ command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
merge.py original_dir new_tuned_yaml_dir output_dir
|
||||
merge.py original_dir new_tuned_yaml_dir output_dir
|
||||
|
||||
The following table describes the logic YAML files.
|
||||
|
||||
@@ -1833,7 +1473,7 @@ de-quantize the ``int4`` key-value from the ``int4`` data type to ``fp16``.
|
||||
|
||||
From the IR snippet, you can see ``i32`` data is loaded from global memory to
|
||||
registers (``%190``). With a few element-wise operations in registers, it is
|
||||
stored in shared memory (``%269``) for the transpose operation (``%270``), which
|
||||
stored in shared memory (``%269``) for the transpose operation (``%270``), which
|
||||
needs data movement across different threads. With the transpose done, it is
|
||||
loaded from LDS to register again (``%276``), and with a few more
|
||||
element-wise operations, it is stored to LDS again (``%298``). The last step
|
||||
@@ -1967,7 +1607,7 @@ something similar to the following:
|
||||
loaded at: [0x7fd4f100c000-0x7fd4f100e070]
|
||||
|
||||
The kernel name and the code object file should be listed. In the
|
||||
example above, the kernel name is vector_add_assert_trap, but this might
|
||||
example above, the kernel name is vector_add_assert_trap, but this might
|
||||
also look like:
|
||||
|
||||
.. code-block:: text
|
||||
@@ -2081,3 +1721,8 @@ Hardware efficiency is maximized with 4 or fewer HIP streams. These environment
|
||||
configuration to two compute streams and two RCCL streams, aligning with this best practice.
|
||||
Additionally, RCCL is often pre-optimized for MI300 systems in production by querying the node
|
||||
topology during startup, reducing the need for extensive manual tuning.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
* :doc:`vllm-optimization`
|
||||
|
||||
@@ -0,0 +1,482 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the ROCm vLLM Docker image.
|
||||
:keywords: model, MAD, automation, dashboarding, validate
|
||||
|
||||
**********************************
|
||||
vLLM inference performance testing
|
||||
**********************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm vLLM
|
||||
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||
|
||||
.. _vllm-benchmark-unified-docker-930:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
|
||||
The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers a
|
||||
prebuilt, optimized environment for validating large language model (LLM)
|
||||
inference performance on AMD Instinct™ MI355X, MI350X, MI325X and MI300X
|
||||
GPUs. This ROCm vLLM Docker image integrates vLLM and PyTorch tailored
|
||||
specifically for AMD data center GPUs and includes the following components:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
With this Docker image, you can quickly test the :ref:`expected
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-930>` for
|
||||
AMD Instinct GPUs.
|
||||
|
||||
What's new
|
||||
==========
|
||||
|
||||
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
|
||||
|
||||
* Added support for AMD Instinct MI355X and MI350X GPUs.
|
||||
|
||||
* Added support and benchmarking instructions for the following models. See :ref:`vllm-benchmark-supported-models-930`.
|
||||
|
||||
* Llama 4 Scout and Maverick
|
||||
|
||||
* DeepSeek R1 0528 FP8
|
||||
|
||||
* MXFP4 models (MI355X and MI350X only): Llama 3.3 70B MXFP4 and Llama 3.1 405B MXFP4
|
||||
|
||||
* GPT OSS 20B and 120B
|
||||
|
||||
* Qwen 3 32B, 30B-A3B, and 235B-A22B
|
||||
|
||||
* Removed the deprecated ``--max-seq-len-to-capture`` flag.
|
||||
|
||||
* ``--gpu-memory-utilization`` is now configurable via the `configuration files
|
||||
<https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__ in the MAD
|
||||
repository.
|
||||
|
||||
.. _vllm-benchmark-supported-models-930:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. _vllm-benchmark-available-models-930:
|
||||
|
||||
The following models are supported for inference performance benchmarking
|
||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||
documentation might vary by model -- select one to get started. MXFP4 models
|
||||
are only supported on MI355X and MI350X GPUs.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _vllm-benchmark-vllm-930:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
|
||||
{% if model.precision == "float4" %}
|
||||
.. important::
|
||||
|
||||
MXFP4 is supported only on MI355X and MI350X GPUs.
|
||||
{% endif %}
|
||||
|
||||
.. note::
|
||||
|
||||
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||
{% if model.precision == "float8" and model.model_repo.startswith("amd") %}
|
||||
This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
|
||||
{% endif %}
|
||||
{% if model.precision == "float4" and model.model_repo.startswith("amd") %}
|
||||
This model uses FP4 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
|
||||
{% endif %}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _vllm-benchmark-performance-measurements-930:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
page provides reference throughput and serving measurements for inferencing popular AI models.
|
||||
|
||||
.. important::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
only reflects the latest version of this inference benchmarking environment.
|
||||
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct GPUs or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
|
||||
Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. _vllm-benchmark-mad-930:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node with the
|
||||
:literal:`{{model.precision}}` data type.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
Although the :ref:`available models
|
||||
<vllm-benchmark-available-models-930>` are preconfigured to collect
|
||||
offline throughput and online serving performance data, you can
|
||||
also change the benchmarking parameters. See the standalone
|
||||
benchmarking tab for more information.
|
||||
|
||||
{% if model.tunableop %}
|
||||
|
||||
.. note::
|
||||
|
||||
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
|
||||
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||
operators to find the fastest one for your hardware.
|
||||
|
||||
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
|
||||
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
|
||||
the ``--tunableop on`` argument in your run.
|
||||
|
||||
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
|
||||
performance-collection run.
|
||||
|
||||
{% endif %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
The following commands are optimized for {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
|
||||
|
||||
.. seealso::
|
||||
|
||||
For more information on configuration, see the `config files
|
||||
<https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
|
||||
in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
|
||||
for descriptions of available configuration options
|
||||
and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
|
||||
additional benchmarking information.
|
||||
|
||||
.. rubric:: Launch the container
|
||||
|
||||
You can run the vLLM benchmark tool independently by starting the
|
||||
`Docker container <{{ docker.docker_hub_url }}>`_ as shown
|
||||
in the following snippet.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
docker run -it \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--shm-size 16G \
|
||||
--security-opt seccomp=unconfined \
|
||||
--security-opt apparmor=unconfined \
|
||||
--cap-add=SYS_PTRACE \
|
||||
-v $(pwd):/workspace \
|
||||
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||
--name test \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. rubric:: Throughput command
|
||||
|
||||
Use the following command to start the throughput benchmark.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
model={{ model.model_repo }}
|
||||
tp={{ model.config.tp }}
|
||||
num_prompts={{ model.config.num_prompts | default(1024) }}
|
||||
in={{ model.config.in | default(128) }}
|
||||
out={{ model.config.out | default(128) }}
|
||||
dtype={{ model.config.dtype | default("auto") }}
|
||||
kv_cache_dtype={{ model.config.kv_cache_dtype }}
|
||||
max_num_seqs={{ model.config.max_num_seqs | default(1024) }}
|
||||
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
|
||||
max_model_len={{ model.config.max_model_len }}
|
||||
|
||||
vllm bench throughput --model $model \
|
||||
-tp $tp \
|
||||
--num-prompts $num_prompts \
|
||||
--input-len $in \
|
||||
--output-len $out \
|
||||
--dtype $dtype \
|
||||
--kv-cache-dtype $kv_cache_dtype \
|
||||
--max-num-seqs $max_num_seqs \
|
||||
--max-num-batched-tokens $max_num_batched_tokens \
|
||||
--max-model-len $max_model_len \
|
||||
--trust-remote-code \
|
||||
--output-json ${model}_throughput.json \
|
||||
--gpu-memory-utilization {{ model.config.gpu_memory_utilization | default(0.9) }}
|
||||
|
||||
.. rubric:: Serving command
|
||||
|
||||
1. Start the server using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
model={{ model.model_repo }}
|
||||
tp={{ model.config.tp }}
|
||||
dtype={{ model.config.dtype }}
|
||||
kv_cache_dtype={{ model.config.kv_cache_dtype }}
|
||||
max_num_seqs=256
|
||||
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
|
||||
max_model_len={{ model.config.max_model_len }}
|
||||
|
||||
vllm serve $model \
|
||||
-tp $tp \
|
||||
--dtype $dtype \
|
||||
--kv-cache-dtype $kv_cache_dtype \
|
||||
--max-num-seqs $max_num_seqs \
|
||||
--max-num-batched-tokens $max_num_batched_tokens \
|
||||
--max-model-len $max_model_len \
|
||||
--no-enable-prefix-caching \
|
||||
--swap-space 16 \
|
||||
--disable-log-requests \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9
|
||||
|
||||
Wait until the model has loaded and the server is ready to accept requests.
|
||||
|
||||
2. On another terminal on the same machine, run the benchmark:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Connect to the container
|
||||
docker exec -it test bash
|
||||
|
||||
# Wait for the server to start
|
||||
until curl -s http://localhost:8000/v1/models; do sleep 30; done
|
||||
|
||||
# Run the benchmark
|
||||
model={{ model.model_repo }}
|
||||
max_concurrency=1
|
||||
num_prompts=10
|
||||
in=128
|
||||
out=128
|
||||
vllm bench serve --model $model \
|
||||
--percentile-metrics "ttft,tpot,itl,e2el" \
|
||||
--dataset-name random \
|
||||
--ignore-eos \
|
||||
--max-concurrency $max_concurrency \
|
||||
--num-prompts $num_prompts \
|
||||
--random-input-len $in \
|
||||
--random-output-len $out \
|
||||
--trust-remote-code \
|
||||
--save-result \
|
||||
--result-filename ${model}_serving.json
|
||||
|
||||
.. note::
|
||||
|
||||
For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B,
|
||||
try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands.
|
||||
|
||||
If you encounter the following error, pass a Hugging Face token that has
been granted access to the gated model.
|
||||
|
||||
.. code-block::
|
||||
|
||||
OSError: You are trying to access a gated repo.
|
||||
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<style>
|
||||
mjx-container[jax="CHTML"][display="true"] {
|
||||
text-align: left;
|
||||
margin: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
.. note::
|
||||
|
||||
Throughput is calculated as:
|
||||
|
||||
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||
|
||||
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||
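As an illustrative example (the numbers are chosen only to show the arithmetic), 1024 requests with 128 input and 128 output tokens each, completing in 60 seconds, give:

- .. math:: throughput\_tot = 1024 \times (128 + 128) / 60 \approx 4369 \text{ tokens/s}

- .. math:: throughput\_gen = 1024 \times 128 / 60 \approx 2185 \text{ tokens/s}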
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Advanced usage
==============
|
||||
|
||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||
see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
|
||||
|
||||
Reproducing the Docker image
----------------------------
|
||||
|
||||
To reproduce this ROCm-enabled vLLM Docker image release, follow these steps:
|
||||
|
||||
1. Clone the `vLLM repository <https://github.com/vllm-project/vllm>`__.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
|
||||
2. Use the following command to build the image directly from the specified commit.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20251006-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
.. code-block:: shell
|
||||
|
||||
docker build -f docker/Dockerfile.rocm \
|
||||
--build-arg REMOTE_VLLM=1 \
|
||||
--build-arg VLLM_REPO=https://github.com/ROCm/vllm \
|
||||
--build-arg VLLM_BRANCH="{{ docker.dockerfile.commit }}" \
|
||||
-t vllm-rocm .
|
||||
|
||||
.. tip::
|
||||
|
||||
Replace ``vllm-rocm`` with your desired image tag.
|
||||
|
||||
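Once the build completes, a quick sanity check is to confirm the image exists locally before running it; ``vllm-rocm`` here matches the tag used in the build command above.

.. code-block:: shell

docker images vllm-rocm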
Further reading
===============
|
||||
|
||||
- To learn more about the options for latency and throughput benchmark scripts,
|
||||
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||
a brief introduction to vLLM and optimization strategies.
|
||||
|
||||
- For application performance optimization strategies for HPC and AI workloads,
|
||||
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
=================
|
||||
|
||||
See :doc:`vllm-history` to find documentation for previous releases
|
||||
of the ``ROCm/vllm`` Docker image.
|
||||
@@ -0,0 +1,472 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the ROCm vLLM Docker image.
|
||||
:keywords: model, MAD, automation, dashboarding, validate
|
||||
|
||||
**********************************
vLLM inference performance testing
**********************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm vLLM
|
||||
inference performance documentation. See :doc:`../vllm` for the latest version.
|
||||
|
||||
.. _vllm-benchmark-unified-docker-1103:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
|
||||
The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers a
|
||||
prebuilt, optimized environment for validating large language model (LLM)
|
||||
inference performance on AMD Instinct™ MI355X, MI350X, MI325X, and MI300X
|
||||
GPUs. This ROCm vLLM Docker image integrates vLLM and PyTorch tailored
|
||||
specifically for AMD data center GPUs and includes the following components:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
With this Docker image, you can quickly test the :ref:`expected
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-1103>` for
|
||||
AMD Instinct GPUs.
|
||||
|
||||
What's new
==========
|
||||
|
||||
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
|
||||
|
||||
* Enabled :ref:`AITER <vllm-optimization-aiter-switches>` by default.
|
||||
|
||||
* Fixed ``rms_norm`` segfault issue with Qwen 3 235B.
|
||||
|
||||
* Known performance degradation on Llama 4 models due to `an upstream vLLM issue <https://github.com/vllm-project/vllm/issues/26320>`_.
|
||||
|
||||
.. _vllm-benchmark-supported-models-1103:
|
||||
|
||||
Supported models
================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. _vllm-benchmark-available-models-1103:
|
||||
|
||||
The following models are supported for inference performance benchmarking
|
||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||
documentation might vary by model -- select one to get started. MXFP4 models
|
||||
are only supported on MI355X and MI350X GPUs.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _vllm-benchmark-vllm-1103:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
|
||||
{% if model.precision == "float4" %}
|
||||
.. important::
|
||||
|
||||
MXFP4 is supported only on MI355X and MI350X GPUs.
|
||||
{% endif %}
|
||||
|
||||
.. note::
|
||||
|
||||
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||
Some models require access authorization prior to use via an external license agreement through a third party.
|
||||
{% if model.precision == "float8" and model.model_repo.startswith("amd") %}
|
||||
This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
|
||||
{% endif %}
|
||||
{% if model.precision == "float4" and model.model_repo.startswith("amd") %}
|
||||
This model uses FP4 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
|
||||
{% endif %}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _vllm-benchmark-performance-measurements-1103:
|
||||
|
||||
Performance measurements
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
page provides reference throughput and serving measurements for inferencing popular AI models.
|
||||
|
||||
.. important::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
only reflects the latest version of this inference benchmarking environment.
|
||||
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct GPUs or ROCm software.
|
||||
|
||||
System validation
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Pull the Docker image
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
|
||||
Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Benchmarking
============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. _vllm-benchmark-mad-1103:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-1103` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node with the
|
||||
:literal:`{{model.precision}}` data type.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
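If you want to follow the run from another terminal while it is in progress, the container name above can be used directly with the Docker CLI. This is optional and only for monitoring:

.. code-block:: shell

# Show the benchmark container started by MAD
docker ps --filter "name=container_ci-{{model.mad_tag}}"

# Stream its logs
docker logs -f container_ci-{{model.mad_tag}}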
Although the :ref:`available models
|
||||
<vllm-benchmark-available-models-1103>` are preconfigured to collect
|
||||
offline throughput and online serving performance data, you can
|
||||
also change the benchmarking parameters. See the standalone
|
||||
benchmarking tab for more information.
|
||||
|
||||
{% if model.tunableop %}
|
||||
|
||||
.. note::
|
||||
|
||||
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
|
||||
TunableOp automatically explores different implementations and configurations of certain PyTorch
|
||||
operators to find the fastest one for your hardware.
|
||||
|
||||
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
|
||||
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
|
||||
the ``--tunableop on`` argument in your run.
|
||||
|
||||
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
|
||||
performance-collection run.
|
||||
|
||||
{% endif %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
The following commands are optimized for {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-1103` to switch to another available model.
|
||||
|
||||
.. seealso::
|
||||
|
||||
For more information on configuration, see the `config files
|
||||
<https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
|
||||
in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
|
||||
for descriptions of available configuration options
|
||||
and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
|
||||
additional benchmarking information.
|
||||
|
||||
.. rubric:: Launch the container
|
||||
|
||||
You can run the vLLM benchmark tool independently by starting the
|
||||
`Docker container <{{ docker.docker_hub_url }}>`_ as shown
|
||||
in the following snippet.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
docker run -it \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--shm-size 16G \
|
||||
--security-opt seccomp=unconfined \
|
||||
--security-opt apparmor=unconfined \
|
||||
--cap-add=SYS_PTRACE \
|
||||
-v $(pwd):/workspace \
|
||||
--env HUGGINGFACE_HUB_CACHE=/workspace \
|
||||
--name test \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. rubric:: Throughput command
|
||||
|
||||
Use the following command to start the throughput benchmark.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
model={{ model.model_repo }}
|
||||
tp={{ model.config.tp }}
|
||||
num_prompts={{ model.config.num_prompts | default(1024) }}
|
||||
in={{ model.config.in | default(128) }}
|
||||
out={{ model.config.out | default(128) }}
|
||||
dtype={{ model.config.dtype | default("auto") }}
|
||||
kv_cache_dtype={{ model.config.kv_cache_dtype }}
|
||||
max_num_seqs={{ model.config.max_num_seqs | default(1024) }}
|
||||
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
|
||||
max_model_len={{ model.config.max_model_len }}
|
||||
|
||||
vllm bench throughput --model $model \
|
||||
-tp $tp \
|
||||
--num-prompts $num_prompts \
|
||||
--input-len $in \
|
||||
--output-len $out \
|
||||
--dtype $dtype \
|
||||
--kv-cache-dtype $kv_cache_dtype \
|
||||
--max-num-seqs $max_num_seqs \
|
||||
--max-num-batched-tokens $max_num_batched_tokens \
|
||||
--max-model-len $max_model_len \
|
||||
--trust-remote-code \
|
||||
--output-json ${model}_throughput.json \
|
||||
--gpu-memory-utilization {{ model.config.gpu_memory_utilization | default(0.9) }}
|
||||
|
||||
.. rubric:: Serving command
|
||||
|
||||
1. Start the server using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
model={{ model.model_repo }}
|
||||
tp={{ model.config.tp }}
|
||||
dtype={{ model.config.dtype }}
|
||||
kv_cache_dtype={{ model.config.kv_cache_dtype }}
|
||||
max_num_seqs=256
|
||||
max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
|
||||
max_model_len={{ model.config.max_model_len }}
|
||||
|
||||
vllm serve $model \
|
||||
-tp $tp \
|
||||
--dtype $dtype \
|
||||
--kv-cache-dtype $kv_cache_dtype \
|
||||
--max-num-seqs $max_num_seqs \
|
||||
--max-num-batched-tokens $max_num_batched_tokens \
|
||||
--max-model-len $max_model_len \
|
||||
--no-enable-prefix-caching \
|
||||
--swap-space 16 \
|
||||
--disable-log-requests \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9
|
||||
|
||||
Wait until the model has loaded and the server is ready to accept requests.
|
||||
|
||||
2. On another terminal on the same machine, run the benchmark:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# Connect to the container
|
||||
docker exec -it test bash
|
||||
|
||||
# Wait for the server to start
|
||||
until curl -s http://localhost:8000/v1/models; do sleep 30; done
|
||||
|
||||
# Run the benchmark
|
||||
model={{ model.model_repo }}
|
||||
max_concurrency=1
|
||||
num_prompts=10
|
||||
in=128
|
||||
out=128
|
||||
vllm bench serve --model $model \
|
||||
--percentile-metrics "ttft,tpot,itl,e2el" \
|
||||
--dataset-name random \
|
||||
--ignore-eos \
|
||||
--max-concurrency $max_concurrency \
|
||||
--num-prompts $num_prompts \
|
||||
--random-input-len $in \
|
||||
--random-output-len $out \
|
||||
--trust-remote-code \
|
||||
--save-result \
|
||||
--result-filename ${model}_serving.json
|
||||
|
||||
.. note::
|
||||
|
||||
For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B,
|
||||
try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands.
|
||||
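For example, a minimal sketch of setting the flag inside the container before re-running the serving or throughput commands shown above (nothing else changes):

.. code-block:: shell

export VLLM_ROCM_USE_AITER=1
# then re-run the vllm serve / vllm bench commands as shown above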
|
||||
If you encounter the following error, pass a Hugging Face token that has
been granted access to the gated model.
|
||||
|
||||
.. code-block::
|
||||
|
||||
OSError: You are trying to access a gated repo.
|
||||
|
||||
# pass your HF_TOKEN
|
||||
export HF_TOKEN=$your_personal_hf_token
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<style>
|
||||
mjx-container[jax="CHTML"][display="true"] {
|
||||
text-align: left;
|
||||
margin: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
.. note::
|
||||
|
||||
Throughput is calculated as:
|
||||
|
||||
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||
|
||||
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Advanced usage
==============
|
||||
|
||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||
see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
|
||||
|
||||
.. note::
|
||||
|
||||
If you’re using this Docker image on other AMD GPUs such as the AMD Instinct MI200 Series or Radeon, add ``export VLLM_ROCM_USE_AITER=0`` to your command, since AITER is only supported on gfx942 and gfx950 architectures.
|
||||
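For example, one way to check which GPU architecture is visible inside the container before deciding whether to set the flag -- a sketch only, since ``rocminfo`` output formatting can vary between ROCm releases:

.. code-block:: shell

# Print the gfx target(s) of the visible GPUs
rocminfo | grep -o 'gfx[0-9a-f]*' | sort -u

# On anything other than gfx942 or gfx950, disable AITER
export VLLM_ROCM_USE_AITER=0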
|
||||
Reproducing the Docker image
----------------------------
|
||||
|
||||
To reproduce this ROCm-enabled vLLM Docker image release, follow these steps:
|
||||
|
||||
1. Clone the `vLLM repository <https://github.com/vllm-project/vllm>`__.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
|
||||
2. Use the following command to build the image directly from the specified commit.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.11.1_20251103-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
.. code-block:: shell
|
||||
|
||||
docker build -f docker/Dockerfile.rocm \
|
||||
--build-arg REMOTE_VLLM=1 \
|
||||
--build-arg VLLM_REPO=https://github.com/ROCm/vllm \
|
||||
--build-arg VLLM_BRANCH="{{ docker.dockerfile.commit }}" \
|
||||
-t vllm-rocm .
|
||||
|
||||
.. tip::
|
||||
|
||||
Replace ``vllm-rocm`` with your desired image tag.
|
||||
|
||||
Further reading
===============
|
||||
|
||||
- To learn more about the options for latency and throughput benchmark scripts,
|
||||
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
|
||||
a brief introduction to vLLM and optimization strategies.
|
||||
|
||||
- For application performance optimization strategies for HPC and AI workloads,
|
||||
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
=================
|
||||
|
||||
See :doc:`vllm-history` to find documentation for previous releases
|
||||
of the ``ROCm/vllm`` Docker image.
|
||||
@@ -16,14 +16,31 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - ``rocm/vllm:rocm7.0.0_vllm_0.11.2_20251210``
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* vLLM 0.11.2
|
||||
* PyTorch 2.9.0
|
||||
-
|
||||
* :doc:`Documentation <../vllm>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.2_20251210/images/sha256-e7f02dd2ce3824959658bc0391296f6158638e3ebce164f6c019c4eca8150ec7>`__
|
||||
|
||||
* - ``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103``
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* vLLM 0.11.1
|
||||
* PyTorch 2.9.0
|
||||
-
|
||||
* :doc:`Documentation <vllm-0.11.1-20251103>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.11.1_20251103/images/sha256-8d60429043d4d00958da46039a1de0d9b82df814d45da482497eef26a6076506>`__
|
||||
|
||||
* - ``rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006``
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* vLLM 0.10.2
|
||||
* PyTorch 2.9.0
|
||||
-
|
||||
* :doc:`Documentation <vllm-0.10.2-20251006>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5>`__
|
||||
|
||||
* - ``rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909``
|
||||
|
||||
@@ -0,0 +1,398 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
|
||||
prebuilt and optimized docker images.
|
||||
:keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
|
||||
|
||||
************************
xDiT diffusion inference
************************
|
||||
|
||||
.. _xdit-video-diffusion-2510:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
|
||||
a prebuilt, optimized inference environment based on `xDiT
|
||||
<https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion-based
|
||||
video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
|
||||
and MI300X (gfx942) GPUs.
|
||||
This image is based on the ROCm {{docker.ROCm}} preview release via `TheRock <https://github.com/ROCm/TheRock>`_
|
||||
and includes the following software components:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
|
||||
For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
|
||||
|
||||
What's new
==========
|
||||
|
||||
- Initial ROCm-enabled xDiT Docker release for diffusion inference.
|
||||
- Supported architectures: gfx942 and gfx950 (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X).
|
||||
- Supported workloads: Wan 2.1, Wan 2.2, HunyuanVideo, and Flux models.
|
||||
|
||||
.. _xdit-video-diffusion-supported-models-2510:
|
||||
|
||||
Supported models
================
|
||||
|
||||
The following models are supported for inference performance benchmarking.
|
||||
Some instructions, commands, and recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length == 1 %}
|
||||
<div class="col-12 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
To learn more about your specific model, see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
|
||||
or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
|
||||
external license agreement through a third party.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
System validation
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA
|
||||
auto-balancing, you can skip this step. Otherwise, complete the procedures in
|
||||
the `System validation and optimization
|
||||
<https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/system-setup/prerequisite-system-validation.html>`__
|
||||
guide to properly configure your system settings before starting.
|
||||
|
||||
Pull the Docker image
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker %}
|
||||
|
||||
For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
|
||||
Pull the image using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Validate and benchmark
======================
|
||||
|
||||
Once the image has been downloaded, you can follow these steps to
run benchmarks and generate outputs.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups %}
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
The following commands are written for {{ model.model }}.
|
||||
See :ref:`xdit-video-diffusion-supported-models-2510` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _xdit-video-diffusion-setup-2510:
|
||||
|
||||
Prepare the model
-----------------
|
||||
|
||||
.. note::
|
||||
|
||||
If you're using ROCm MAD to :ref:`run your model
|
||||
<xdit-video-diffusion-run-2510>`, you can skip this section. MAD will handle
|
||||
starting the container and downloading required models inside the container.
|
||||
|
||||
You can either use an existing Hugging Face cache or download the model fresh inside the container.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Option 1: Use existing Hugging Face cache
|
||||
|
||||
If you already have models downloaded on your host system, you can mount your existing cache.
|
||||
|
||||
1. Set your Hugging Face cache location.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/your/hf_cache/location
|
||||
|
||||
2. Download the model (if not already cached).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
3. Launch the container with mounted cache.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
-e HF_HOME=/app/huggingface_models \
|
||||
-v $HF_HOME:/app/huggingface_models \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. tab-item:: Option 2: Download inside container
|
||||
|
||||
Use this option if you prefer to keep the container self-contained or don't have an existing cache.
|
||||
|
||||
1. Launch the container
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
2. Inside the container, set the Hugging Face cache location and download the model.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/app/huggingface_models
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
.. warning::
|
||||
|
||||
Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
|
||||
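If you want downloads to survive container removal, one option is a named Docker volume; the volume name ``xdit-hf-cache`` below is only an example:

.. code-block:: shell

docker volume create xdit-hf-cache
# Add this mount to the docker run command shown above:
#   -v xdit-hf-cache:/app/huggingface_models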
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _xdit-video-diffusion-run-2510:
|
||||
|
||||
Run inference
=============
|
||||
|
||||
You can benchmark models through `MAD <https://github.com/ROCm/MAD>`__-integrated automation or standalone
|
||||
torchrun commands.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.10-inference-models.yaml
|
||||
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
To run the benchmarks for {{ model.model }}, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
{% if model.model == "Hunyuan Video" %}
|
||||
cd /app/Hunyuanvideo
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--model tencent/HunyuanVideo \
|
||||
--prompt "In the large cage, two puppies were wagging their tails at each other." \
|
||||
--height 720 --width 1280 --num_frames 129 \
|
||||
--num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
|
||||
--ulysses_degree 8 \
|
||||
--enable_tiling --enable_slicing \
|
||||
--use_torch_compile \
|
||||
--bench_output results
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.1" %}
|
||||
cd Wan2.1
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--task i2v-14B \
|
||||
--size 720*1280 --frame_num 81 \
|
||||
--ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \
|
||||
--image "/app/Wan2.1/examples/i2v_input.JPG" \
|
||||
--ulysses_size 8 --ring_size 1 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
|
||||
--offload_model 0 \
|
||||
--vae_dtype bfloat16 \
|
||||
--allow_tf32 \
|
||||
--compile
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.2" %}
|
||||
cd Wan2.2
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--task i2v-A14B \
|
||||
--size 720*1280 --frame_num 81 \
|
||||
--ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \
|
||||
--image "/app/Wan2.2/examples/i2v_input.JPG" \
|
||||
--ulysses_size 8 --ring_size 1 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
|
||||
--offload_model 0 \
|
||||
--vae_dtype bfloat16 \
|
||||
--allow_tf32 \
|
||||
--compile
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1" %}
|
||||
cd Flux
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Flux/run.py \
|
||||
--model black-forest-labs/FLUX.1-dev \
|
||||
--seed 42 \
|
||||
--prompt "A small cat" \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--num_inference_steps 25 \
|
||||
--max_sequence_length 256 \
|
||||
--warmup_steps 5 \
|
||||
--no_use_resolution_binning \
|
||||
--ulysses_degree 8 \
|
||||
--use_torch_compile \
|
||||
--num_repetitions 1 \
|
||||
--benchmark_output_directory results
|
||||
|
||||
{% endif %}
|
||||
|
||||
The generated video will be stored under the results directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout.{% elif model.model in ["Wan2.1", "Wan2.2"] %}results/outputs/rank0_*.json{% elif model.model == "FLUX.1" %}results/timing.json{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Further reading
===============
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see `AMD
|
||||
Infinity Hub
|
||||
<https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`__.
|
||||
|
||||
Previous versions
=================
|
||||
|
||||
See :doc:`xdit-history` to find documentation for previous releases
|
||||
of xDiT diffusion inference performance testing.
|
||||
@@ -0,0 +1,389 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn to validate diffusion model video generation on MI300X, MI350X and MI355X accelerators using
|
||||
prebuilt and optimized docker images.
|
||||
:keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
|
||||
|
||||
************************
xDiT diffusion inference
************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of the xDiT diffusion
inference documentation. See
:doc:`/how-to/rocm-for-ai/inference/xdit-diffusion-inference` for the latest
version.
|
||||
|
||||
.. _xdit-video-diffusion-2511:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers a prebuilt, optimized environment based on `xDiT <https://github.com/xdit-project/xDiT>`_ for
|
||||
benchmarking diffusion model video and image generation on gfx942 and gfx950 series (AMD Instinct™ MI300X, MI325X, MI350X, and MI355X) GPUs.
|
||||
The image runs ROCm **{{docker.ROCm}}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_
|
||||
and includes the following components:
|
||||
|
||||
.. dropdown:: Software components
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
|
||||
For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
|
||||
|
||||
What's new
==========
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
{% for item in docker.whats_new %}
|
||||
* {{ item }}
|
||||
{% endfor %}
|
||||
|
||||
.. _xdit-video-diffusion-supported-models-2511:
|
||||
|
||||
Supported models
================
|
||||
|
||||
The following models are supported for inference performance benchmarking.
|
||||
Some instructions, commands, and recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups %}
|
||||
|
||||
{# Create a lookup for supported models #}
|
||||
{% set supported_lookup = {} %}
|
||||
{% for supported in docker.supported_models %}
|
||||
{% set _ = supported_lookup.update({supported.group: supported.models}) %}
|
||||
{% endfor %}
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% if model_group.group in supported_lookup %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% if model_group.group in supported_lookup %}
|
||||
{% set supported_models = supported_lookup[model_group.group] %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if model.model in supported_models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.page_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.page_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.page_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
To learn more about your specific model, see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
|
||||
or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization before use via an
|
||||
external license agreement through a third party.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
System validation
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Pull the Docker image
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
|
||||
For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
|
||||
Pull the image using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Validate and benchmark
======================
|
||||
|
||||
Once the image has been downloaded, you can follow these steps to
run benchmarks and generate outputs.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
|
||||
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups %}
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.page_tag}}
|
||||
|
||||
The following commands are written for {{ model.model }}.
|
||||
See :ref:`xdit-video-diffusion-supported-models-2511` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Choose your setup method
------------------------
|
||||
|
||||
You can either use an existing Hugging Face cache or download the model fresh inside the container.
|
||||
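If you are unsure whether the model is already present in your host cache, ``huggingface-cli scan-cache`` lists what is cached. This is optional and assumes the ``huggingface_hub`` CLI is installed on the host; the cache path below is the same placeholder used in Option 1.

.. code-block:: shell

export HF_HOME=/your/hf_cache/location
huggingface-cli scan-cache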
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
|
||||
|
||||
{% set docker = data.xdit_diffusion_inference.docker | selectattr("version", "equalto", "v25-11") | first %}
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.page_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Option 1: Use existing Hugging Face cache
|
||||
|
||||
If you already have models downloaded on your host system, you can mount your existing cache.
|
||||
|
||||
1. Set your Hugging Face cache location.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/your/hf_cache/location
|
||||
2. Download the model (if not already cached).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
3. Launch the container with mounted cache.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
-e HF_HOME=/app/huggingface_models \
|
||||
-v $HF_HOME:/app/huggingface_models \
|
||||
{{ docker.pull_tag }}
|
||||
.. tab-item:: Option 2: Download inside container
|
||||
|
||||
Use this option if you prefer to keep the container self-contained or don't have an existing cache.
|
||||
|
||||
1. Launch the container
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
{{ docker.pull_tag }}
|
||||
2. Inside the container, set the Hugging Face cache location and download the model.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/app/huggingface_models
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
.. warning::
|
||||
|
||||
Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Run inference
=============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.11-inference-models.yaml
|
||||
|
||||
{% set model_groups = data.xdit_diffusion_inference.model_groups%}
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.page_tag }}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
To run the benchmarks for {{ model.model }}, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
{% if model.model == "Hunyuan Video" %}
|
||||
cd /app/Hunyuanvideo
|
||||
mkdir results
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--model tencent/HunyuanVideo \
|
||||
--prompt "In the large cage, two puppies were wagging their tails at each other." \
|
||||
--height 720 --width 1280 --num_frames 129 \
|
||||
--num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
|
||||
--ulysses_degree 8 \
|
||||
--enable_tiling --enable_slicing \
|
||||
--use_torch_compile \
|
||||
--bench_output results
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.1" %}
|
||||
cd Wan2.1
|
||||
mkdir results
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--task i2v-14B \
|
||||
--size 720*1280 --frame_num 81 \
|
||||
--ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.1-I2V-14B-720P/snapshots/8823af45fcc58a8aa999a54b04be9abc7d2aac98/" \
|
||||
--image "/app/Wan2.1/examples/i2v_input.JPG" \
|
||||
--ulysses_size 8 --ring_size 1 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
|
||||
--offload_model 0 \
|
||||
--vae_dtype bfloat16 \
|
||||
--allow_tf32 \
|
||||
--compile
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.2" %}
|
||||
cd Wan2.2
|
||||
mkdir results
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--task i2v-A14B \
|
||||
--size 720*1280 --frame_num 81 \
|
||||
--ckpt_dir "${HF_HOME}/hub/models--Wan-AI--Wan2.2-I2V-A14B/snapshots/206a9ee1b7bfaaf8f7e4d81335650533490646a3/" \
|
||||
--image "/app/Wan2.2/examples/i2v_input.JPG" \
|
||||
--ulysses_size 8 --ring_size 1 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--benchmark_output_directory results --save_file video.mp4 --num_benchmark_steps 1 \
|
||||
--offload_model 0 \
|
||||
--vae_dtype bfloat16 \
|
||||
--allow_tf32 \
|
||||
--compile
|
||||
{% endif %}
|
||||
{% if model.model == "FLUX.1" %}
|
||||
cd Flux
|
||||
mkdir results
|
||||
torchrun --nproc_per_node=8 /app/Flux/run.py \
|
||||
--model black-forest-labs/FLUX.1-dev \
|
||||
--seed 42 \
|
||||
--prompt "A small cat" \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--num_inference_steps 25 \
|
||||
--max_sequence_length 256 \
|
||||
--warmup_steps 5 \
|
||||
--no_use_resolution_binning \
|
||||
--ulysses_degree 8 \
|
||||
--use_torch_compile \
|
||||
--num_repetitions 1 \
|
||||
--benchmark_output_directory results
|
||||
{% endif %}
|
||||
The generated video or image will be stored under the ``results`` directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout{% elif model.model in ["Wan2.1", "Wan2.2"] %}``results/outputs/rank0_*.json``{% elif model.model == "FLUX.1" %}``results/timing.json``{% endif %}.
|
||||
{% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See
|
||||
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history`
|
||||
to find documentation for previous releases of xDiT diffusion inference
|
||||
performance testing.
|
||||
@@ -0,0 +1,411 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: Learn to validate diffusion model video generation on MI300X, MI350X, and MI355X accelerators using
prebuilt and optimized Docker images.
|
||||
:keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
|
||||
|
||||
************************
|
||||
xDiT diffusion inference
|
||||
************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of the ROCm xDiT
diffusion inference documentation. See
:doc:`/how-to/rocm-for-ai/inference/xdit-diffusion-inference` for the latest
version.
|
||||
|
||||
.. _xdit-video-diffusion-2512:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
|
||||
a prebuilt, optimized environment based on `xDiT
|
||||
<https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion model
|
||||
video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
|
||||
and MI300X (gfx942) GPUs.
|
||||
|
||||
The image runs ROCm **{{ docker.ROCm }}** (preview) based on `TheRock <https://github.com/ROCm/TheRock>`_
|
||||
and includes the following components:
|
||||
|
||||
.. dropdown:: Software components
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_data in docker.components.items() %}
|
||||
* - `{{ component_name }} <{{ component_data.url }}>`_
|
||||
- {{ component_data.version }}
|
||||
{% endfor %}
|
||||
|
||||
Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
|
||||
For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
|
||||
|
||||
What's new
|
||||
==========
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
{% for item in docker.whats_new %}
|
||||
* {{ item }}
|
||||
{% endfor %}
|
||||
|
||||
.. _xdit-video-diffusion-supported-models-2512:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are supported for inference performance benchmarking.
|
||||
Some instructions, commands, and recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in docker.supported_models %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.js_tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.js_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
To learn more about your specific model, see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization
through an external license agreement with a third party before use.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
|
||||
Pull the image using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Validate and benchmark
|
||||
======================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
Once the image has been downloaded, follow these steps to
run benchmarks and generate outputs.
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.js_tag}}
|
||||
|
||||
The following commands are written for {{ model.model }}.
|
||||
See :ref:`xdit-video-diffusion-supported-models-2512` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Choose your setup method
|
||||
------------------------
|
||||
|
||||
You can either use an existing Hugging Face cache or download the model fresh inside the container.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
.. container:: model-doc {{model.js_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Option 1: Use existing Hugging Face cache
|
||||
|
||||
If you already have models downloaded on your host system, you can mount your existing cache.
|
||||
|
||||
1. Set your Hugging Face cache location.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/your/hf_cache/location
|
||||
|
||||
2. Download the model (if not already cached).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
3. Launch the container with mounted cache.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
-e HF_HOME=/app/huggingface_models \
|
||||
-v $HF_HOME:/app/huggingface_models \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. tab-item:: Option 2: Download inside container
|
||||
|
||||
Use this option if you prefer to keep the container self-contained or don't have an existing cache.
|
||||
|
||||
1. Launch the container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
2. Inside the container, set the Hugging Face cache location and download the model.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/app/huggingface_models
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
.. warning::
|
||||
|
||||
Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
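
Before running a benchmark, you can optionally confirm what was downloaded into the container's cache. For example, assuming the ``huggingface-cli`` bundled in the image provides the ``scan-cache`` subcommand:

.. code-block:: shell

# Lists cached repositories, revisions, and on-disk sizes under HF_HOME.
huggingface-cli scan-cache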
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Run inference
|
||||
=============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/xdit_25.12-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.js_tag }}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (MAD) repository from `<https://github.com/ROCm/MAD>`__ to a local
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
To run the benchmarks for {{ model.model }}, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
{% if model.model == "Hunyuan Video" %}
|
||||
cd /app/Hunyuanvideo
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--prompt "In the large cage, two puppies were wagging their tails at each other." \
|
||||
--height 720 --width 1280 --num_frames 129 \
|
||||
--num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
|
||||
--ulysses_degree 8 \
|
||||
--enable_tiling --enable_slicing \
|
||||
--use_torch_compile \
|
||||
--bench_output results
|
||||
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.1" %}
|
||||
cd Wan
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Wan/run.py \
|
||||
--task i2v \
|
||||
--height 720 \
|
||||
--width 1280 \
|
||||
--model {{ model.model_repo }} \
|
||||
--img_file_path /app/Wan/i2v_input.JPG \
|
||||
--ulysses_degree 8 \
|
||||
--seed 42 \
|
||||
--num_frames 81 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--num_repetitions 1 \
|
||||
--num_inference_steps 40 \
|
||||
--use_torch_compile
|
||||
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.2" %}
|
||||
cd Wan
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Wan/run.py \
|
||||
--task i2v \
|
||||
--height 720 \
|
||||
--width 1280 \
|
||||
--model {{ model.model_repo }} \
|
||||
--img_file_path /app/Wan/i2v_input.JPG \
|
||||
--ulysses_degree 8 \
|
||||
--seed 42 \
|
||||
--num_frames 81 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--num_repetitions 1 \
|
||||
--num_inference_steps 40 \
|
||||
--use_torch_compile
|
||||
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1" %}
|
||||
cd Flux
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Flux/run.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--seed 42 \
|
||||
--prompt "A small cat" \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--num_inference_steps 25 \
|
||||
--max_sequence_length 256 \
|
||||
--warmup_steps 5 \
|
||||
--no_use_resolution_binning \
|
||||
--ulysses_degree 8 \
|
||||
--use_torch_compile \
|
||||
--num_repetitions 50
|
||||
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "stable-diffusion-3.5-large" %}
|
||||
cd StableDiffusion3.5
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--num_inference_steps 28 \
|
||||
--prompt "A capybara holding a sign that reads Hello World" \
|
||||
--use_torch_compile \
|
||||
--pipefusion_parallel_degree 4 \
|
||||
--use_cfg_parallel \
|
||||
--num_repetitions 50 \
|
||||
--dtype torch.float16 \
|
||||
--output_path results
|
||||
|
||||
{% endif %}
|
||||
|
||||
The generated video or image will be stored under the ``results`` directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout{% elif model.model in ["Wan2.1", "Wan2.2"] %}``results/outputs/rank0_*.json``{% elif model.model == "FLUX.1" %}``results/timing.json``{% elif model.model == "stable-diffusion-3.5-large" %}``benchmark_results.csv``{% endif %}.
|
||||
|
||||
{% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See
|
||||
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/xdit-history`
|
||||
to find documentation for previous releases of xDiT diffusion inference
|
||||
performance testing.
|
||||
@@ -0,0 +1,47 @@
|
||||
:orphan:
|
||||
|
||||
************************************************************
|
||||
xDiT diffusion inference performance testing version history
|
||||
************************************************************
|
||||
|
||||
This table lists previous versions of the ROCm xDiT diffusion inference performance
|
||||
testing environment. For detailed information about available models for
|
||||
benchmarking, see the version-specific documentation.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Docker image tag
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - ``rocm/pytorch-xdit:v25.13`` (latest)
|
||||
-
|
||||
* TheRock 1728a81
|
||||
-
|
||||
* :doc:`Documentation <../../xdit-diffusion-inference>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.13/images/sha256-81954713070d67bde08595e03f62110c8a3dd66a9ae17a77d611e01f83f0f4ef>`__
|
||||
|
||||
* - ``rocm/pytorch-xdit:v25.12``
|
||||
-
|
||||
* `ROCm 7.10.0 preview <https://rocm.docs.amd.com/en/7.10.0-preview/about/release-notes.html>`__
|
||||
* TheRock 3e3f834
|
||||
-
|
||||
* :doc:`Documentation <xdit-25.12>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.12/images/sha256-e06895132316bf3c393366b70a91eaab6755902dad0100e6e2b38310547d9256>`__
|
||||
|
||||
* - ``rocm/pytorch-xdit:v25.11``
|
||||
-
|
||||
* `ROCm 7.10.0 preview <https://rocm.docs.amd.com/en/7.10.0-preview/about/release-notes.html>`__
|
||||
* TheRock 3e3f834
|
||||
-
|
||||
* :doc:`Documentation <xdit-25.11>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.11/images/sha256-c9fa659439bb024f854b4d5eea598347251b02c341c55f66c98110832bde4216>`__
|
||||
|
||||
* - ``rocm/pytorch-xdit:v25.10``
|
||||
-
|
||||
* `ROCm 7.9.0 preview <https://rocm.docs.amd.com/en/7.9.0-preview/about/release-notes.html>`__
|
||||
* TheRock 7afbe45
|
||||
-
|
||||
* :doc:`Documentation <xdit-25.10>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-xdit/v25.10/images/sha256-d79715ff18a9470e3f907cec8a9654d6b783c63370b091446acffc0de4d7070e>`__
|
||||
@@ -6,7 +6,7 @@
|
||||
vLLM inference performance testing
|
||||
**********************************
|
||||
|
||||
.. _vllm-benchmark-unified-docker-930:
|
||||
.. _vllm-benchmark-unified-docker-1210:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
|
||||
|
||||
@@ -34,35 +34,18 @@ vLLM inference performance testing
|
||||
{% endfor %}
|
||||
|
||||
With this Docker image, you can quickly test the :ref:`expected
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-930>` for
|
||||
inference performance numbers <vllm-benchmark-performance-measurements-1210>` for
|
||||
AMD Instinct GPUs.
|
||||
|
||||
What's new
|
||||
==========
|
||||
|
||||
The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
|
||||
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM
Docker release <previous-versions/vllm-history>`.
|
||||
|
||||
* Added support for AMD Instinct MI355X and MI350X GPUs.
|
||||
- Improved performance on Llama 3 MXFP4 through AITER optimizations and improved kernel fusion.
|
||||
|
||||
* Added support and benchmarking instructions for the following models. See :ref:`vllm-benchmark-supported-models-930`.
|
||||
|
||||
* Llama 4 Scout and Maverick
|
||||
|
||||
* DeepSeek R1 0528 FP8
|
||||
|
||||
* MXFP4 models (MI355X and MI350X only): Llama 3.3 70B MXFP4 and Llama 3.1 405B MXFP4
|
||||
|
||||
* GPT OSS 20B and 120B
|
||||
|
||||
* Qwen 3 32B, 30B-A3B, and 235B-A22B
|
||||
|
||||
* Removed the deprecated ``--max-seq-len-to-capture`` flag.
|
||||
|
||||
* ``--gpu-memory-utilization`` is now configurable via the `configuration files
|
||||
<https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__ in the MAD
|
||||
repository.
|
||||
|
||||
.. _vllm-benchmark-supported-models-930:
|
||||
.. _vllm-benchmark-supported-models-1210:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -72,7 +55,7 @@ Supported models
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. _vllm-benchmark-available-models-930:
|
||||
.. _vllm-benchmark-available-models-1210:
|
||||
|
||||
The following models are supported for inference performance benchmarking
|
||||
with vLLM and ROCm. Some instructions, commands, and recommendations in this
|
||||
@@ -108,7 +91,7 @@ Supported models
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _vllm-benchmark-vllm-930:
|
||||
.. _vllm-benchmark-vllm-1210:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
@@ -122,6 +105,15 @@ Supported models
|
||||
MXFP4 is supported only on MI355X and MI350X GPUs.
|
||||
{% endif %}
|
||||
|
||||
{% if model.mad_tag in ["pyt_vllm_mixtral-8x7b", "pyt_vllm_mixtral-8x7b_fp8", "pyt_vllm_mixtral-8x22b", "pyt_vllm_mixtral-8x22b_fp8", "pyt_vllm_deepseek-r1"] %}
|
||||
.. caution::
|
||||
|
||||
There is a known regression with AITER for MoE models such as Mixtral and
DeepSeek-R1. Consider using the :doc:`previous release
<previous-versions/vllm-0.11.1-20251103>`
(``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103``) for better performance.
|
||||
{% endif %}
|
||||
|
||||
.. note::
|
||||
|
||||
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
|
||||
@@ -136,7 +128,7 @@ Supported models
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _vllm-benchmark-performance-measurements-930:
|
||||
.. _vllm-benchmark-performance-measurements-1210:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
@@ -192,7 +184,7 @@ Benchmarking
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. _vllm-benchmark-mad-930:
|
||||
.. _vllm-benchmark-mad-1210:
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
@@ -204,7 +196,7 @@ Benchmarking
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
|
||||
See :ref:`vllm-benchmark-supported-models-1210` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
@@ -233,7 +225,7 @@ Benchmarking
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
|
||||
|
||||
Although the :ref:`available models
|
||||
<vllm-benchmark-available-models-930>` are preconfigured to collect
|
||||
<vllm-benchmark-available-models-1210>` are preconfigured to collect
|
||||
offline throughput and online serving performance data, you can
|
||||
also change the benchmarking parameters. See the standalone
|
||||
benchmarking tab for more information.
|
||||
@@ -258,7 +250,7 @@ Benchmarking
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
The following commands are optimized for {{ model.model }}.
|
||||
See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
|
||||
See :ref:`vllm-benchmark-supported-models-1210` to switch to another available model.
|
||||
|
||||
.. seealso::
|
||||
|
||||
@@ -419,6 +411,10 @@ Advanced usage
|
||||
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
|
||||
see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
|
||||
|
||||
.. note::
|
||||
|
||||
If you’re using this Docker image on other AMD GPUs such as the AMD Instinct MI200 Series or Radeon, add ``export VLLM_ROCM_USE_AITER=0`` to your command, since AITER is only supported on gfx942 and gfx950 architectures.
|
||||
|
||||
Reproducing the Docker image
|
||||
----------------------------
|
||||
|
||||
@@ -448,6 +444,14 @@ To reproduce this ROCm-enabled vLLM Docker image release, follow these steps:
|
||||
|
||||
Replace ``vllm-rocm`` with your desired image tag.
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
There is a known regression with AITER for MoE models such as Mixtral and
|
||||
DeepSeek-R1. Consider using the :doc:`previous release
|
||||
<previous-versions/vllm-0.11.1-20251103>`
|
||||
(``rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103``) for better performance.
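
For example, to switch back to that release, pull the earlier tag before rerunning your benchmarks:

.. code-block:: shell

docker pull rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103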
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ See the `GitHub repository <https://github.com/vllm-project/vllm>`_ and `officia
|
||||
<https://docs.vllm.ai/>`_ for more information.
|
||||
|
||||
For guidance on using vLLM with ROCm, refer to `Installation with ROCm
|
||||
<https://docs.vllm.ai/en/latest/getting_started/amd-installation.html>`_.
|
||||
<https://docs.vllm.ai/en/stable/getting_started/installation/gpu.html#amd-rocm>`__.
|
||||
|
||||
vLLM installation
|
||||
-----------------
|
||||
|
||||
@@ -26,4 +26,6 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
|
||||
|
||||
- :doc:`SGLang inference performance testing <benchmark-docker/sglang>`
|
||||
|
||||
- :doc:`xDiT diffusion inference <xdit-diffusion-inference>`
|
||||
|
||||
- :doc:`Deploying your model <deploy-your-model>`
|
||||
|
||||
462
docs/how-to/rocm-for-ai/inference/xdit-diffusion-inference.rst
Normal file
@@ -0,0 +1,462 @@
|
||||
.. meta::
|
||||
:description: Learn to validate diffusion model video generation on MI300X, MI350X, and MI355X accelerators using
prebuilt and optimized Docker images.
|
||||
:keywords: xDiT, diffusion, video, video generation, image, image generation, validate, benchmark
|
||||
|
||||
************************
|
||||
xDiT diffusion inference
|
||||
************************
|
||||
|
||||
.. _xdit-video-diffusion:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
The `rocm/pytorch-xdit <{{ docker.docker_hub_url }}>`_ Docker image offers
|
||||
a prebuilt, optimized environment based on `xDiT
|
||||
<https://github.com/xdit-project/xDiT>`_ for benchmarking diffusion model
|
||||
video and image generation on AMD Instinct MI355X, MI350X (gfx950), MI325X,
|
||||
and MI300X (gfx942) GPUs.
|
||||
|
||||
The image runs a preview version of ROCm using the new `TheRock
|
||||
<https://github.com/ROCm/TheRock>`__ build system and includes the following
|
||||
components:
|
||||
|
||||
.. dropdown:: Software components - {{ docker.pull_tag.split('-')|last }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_data in docker.components.items() %}
|
||||
* - `{{ component_name }} <{{ component_data.url }}>`_
|
||||
- {{ component_data.version }}
|
||||
{% endfor %}
|
||||
|
||||
Follow this guide to pull the required image, spin up a container, download the model, and run a benchmark.
|
||||
For preview and development releases, see `amdsiloai/pytorch-xdit <https://hub.docker.com/r/amdsiloai/pytorch-xdit>`_.
|
||||
|
||||
What's new
|
||||
==========
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
{% for item in docker.whats_new %}
|
||||
* {{ item }}
|
||||
{% endfor %}
|
||||
|
||||
.. _xdit-video-diffusion-supported-models:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are supported for inference performance benchmarking.
|
||||
Some instructions, commands, and recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in docker.supported_models %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.js_tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.js_tag }}" data-param-group="{{ model_group.js_tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.js_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
To learn more about your specific model, see the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_
or visit the `GitHub page <{{ model.github }}>`__. Note that some models require access authorization
through an external license agreement with a third party before use.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the `Performance results with AMD ROCm software
|
||||
<https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8543b7e6d-item-9eda09e707-tab>`__
|
||||
page provides reference throughput and serving measurements for inferencing popular AI models.
|
||||
|
||||
.. important::
|
||||
|
||||
The performance data presented in `Performance results with AMD ROCm
|
||||
software
|
||||
<https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8543b7e6d-item-9eda09e707-tab>`__
|
||||
only reflects the latest version of this inference benchmarking environment.
|
||||
The listed measurements should not be interpreted as the peak performance
|
||||
achievable by AMD Instinct GPUs or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
For this tutorial, it's recommended to use the latest ``{{ docker.pull_tag }}`` Docker image.
|
||||
Pull the image using the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
Validate and benchmark
|
||||
======================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
Once the image has been downloaded, follow these steps to
run benchmarks and generate outputs.
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.js_tag}}
|
||||
|
||||
The following commands are written for {{ model.model }}.
|
||||
See :ref:`xdit-video-diffusion-supported-models` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Choose your setup method
|
||||
------------------------
|
||||
|
||||
You can either use an existing Hugging Face cache or download the model fresh inside the container.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
.. container:: model-doc {{model.js_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Option 1: Use existing Hugging Face cache
|
||||
|
||||
If you already have models downloaded on your host system, you can mount your existing cache.
|
||||
|
||||
1. Set your Hugging Face cache location.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/your/hf_cache/location
|
||||
|
||||
2. Download the model (if not already cached).
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
3. Launch the container with mounted cache.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
-e HF_HOME=/app/huggingface_models \
|
||||
-v $HF_HOME:/app/huggingface_models \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. tab-item:: Option 2: Download inside container
|
||||
|
||||
Use this option if you prefer to keep the container self-contained or don't have an existing cache.
|
||||
|
||||
1. Launch the container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run \
|
||||
-it --rm \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--user root \
|
||||
--device=/dev/kfd \
|
||||
--device=/dev/dri \
|
||||
--group-add video \
|
||||
--ipc=host \
|
||||
--network host \
|
||||
--privileged \
|
||||
--shm-size 128G \
|
||||
--name pytorch-xdit \
|
||||
-e HSA_NO_SCRATCH_RECLAIM=1 \
|
||||
-e OMP_NUM_THREADS=16 \
|
||||
-e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
2. Inside the container, set the Hugging Face cache location and download the model.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_HOME=/app/huggingface_models
|
||||
huggingface-cli download {{ model.model_repo }} {% if model.revision %} --revision {{ model.revision }} {% endif %}
|
||||
|
||||
.. warning::
|
||||
|
||||
Models will be downloaded to the container's filesystem and will be lost when the container is removed unless you persist the data with a volume.
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Run inference
|
||||
=============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/xdit-inference-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
|
||||
{% for model_group in docker.supported_models %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.js_tag }}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (MAD) repository from `<https://github.com/ROCm/MAD>`__ to a local
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. On the host machine, use this command to run the performance benchmark test on
|
||||
the `{{model.model}} <{{ model.url }}>`_ model using one node.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
|
||||
model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
|
||||
and ``{{ model.mad_tag }}_serving.csv``.
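
If a run fails or you want to inspect the container that MAD created, one way (illustrative) is to look it up by the name given above:

.. code-block:: shell

# Show the MAD-created container, including exited ones.
docker ps -a --filter "name=container_ci-{{ model.mad_tag }}"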
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
To run the benchmarks for {{ model.model }}, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
{% if model.model == "Hunyuan Video" %}
|
||||
cd /app/Hunyuanvideo
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 run.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--prompt "In the large cage, two puppies were wagging their tails at each other." \
|
||||
--height 720 --width 1280 --num_frames 129 \
|
||||
--num_inference_steps 50 --warmup_steps 1 --n_repeats 1 \
|
||||
--ulysses_degree 8 \
|
||||
--enable_tiling --enable_slicing \
|
||||
--use_torch_compile \
|
||||
--bench_output results
|
||||
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.1" %}
|
||||
cd /app/Wan
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Wan/run.py \
|
||||
--task i2v \
|
||||
--height 720 \
|
||||
--width 1280 \
|
||||
--model {{ model.model_repo }} \
|
||||
--img_file_path /app/Wan/i2v_input.JPG \
|
||||
--ulysses_degree 8 \
|
||||
--seed 42 \
|
||||
--num_frames 81 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--num_repetitions 1 \
|
||||
--num_inference_steps 40 \
|
||||
--use_torch_compile
|
||||
|
||||
{% endif %}
|
||||
{% if model.model == "Wan2.2" %}
|
||||
cd /app/Wan
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Wan/run.py \
|
||||
--task i2v \
|
||||
--height 720 \
|
||||
--width 1280 \
|
||||
--model {{ model.model_repo }} \
|
||||
--img_file_path /app/Wan/i2v_input.JPG \
|
||||
--ulysses_degree 8 \
|
||||
--seed 42 \
|
||||
--num_frames 81 \
|
||||
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
|
||||
--num_repetitions 1 \
|
||||
--num_inference_steps 40 \
|
||||
--use_torch_compile
|
||||
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1" %}
|
||||
cd /app/Flux
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Flux/run.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--seed 42 \
|
||||
--prompt "A small cat" \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--num_inference_steps 25 \
|
||||
--max_sequence_length 256 \
|
||||
--warmup_steps 5 \
|
||||
--no_use_resolution_binning \
|
||||
--ulysses_degree 8 \
|
||||
--use_torch_compile \
|
||||
--num_repetitions 50
|
||||
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.1 Kontext" %}
|
||||
cd /app/Flux
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--seed 42 \
|
||||
--prompt "Add a cool hat to the cat" \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--num_inference_steps 30 \
|
||||
--max_sequence_length 512 \
|
||||
--warmup_steps 5 \
|
||||
--no_use_resolution_binning \
|
||||
--ulysses_degree 8 \
|
||||
--use_torch_compile \
|
||||
--img_file_path /app/Flux/cat.png \
|
||||
--model_type flux_kontext \
|
||||
--guidance_scale 2.5 \
|
||||
--num_repetitions 25
|
||||
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "FLUX.2" %}
|
||||
cd /app/Flux
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/Flux/run_usp.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--seed 42 \
|
||||
--prompt "Add a cool hat to the cat" \
|
||||
--height 1024 \
|
||||
--width 1024 \
|
||||
--num_inference_steps 50 \
|
||||
--max_sequence_length 512 \
|
||||
--warmup_steps 5 \
|
||||
--no_use_resolution_binning \
|
||||
--ulysses_degree 8 \
|
||||
--use_torch_compile \
|
||||
--img_file_paths /app/Flux/cat.png \
|
||||
--model_type flux2 \
|
||||
--guidance_scale 4.0 \
|
||||
--num_repetitions 25
|
||||
|
||||
{% endif %}
|
||||
|
||||
{% if model.model == "stable-diffusion-3.5-large" %}
|
||||
cd /app/StableDiffusion3.5
|
||||
mkdir results
|
||||
|
||||
torchrun --nproc_per_node=8 /app/StableDiffusion3.5/run.py \
|
||||
--model {{ model.model_repo }} \
|
||||
--num_inference_steps 28 \
|
||||
--prompt "A capybara holding a sign that reads Hello World" \
|
||||
--use_torch_compile \
|
||||
--pipefusion_parallel_degree 4 \
|
||||
--use_cfg_parallel \
|
||||
--num_repetitions 50 \
|
||||
--dtype torch.float16 \
|
||||
--output_path results
|
||||
|
||||
{% endif %}
|
||||
|
||||
The generated video or image will be stored under the ``results`` directory. For the actual benchmark step runtimes, see {% if model.model == "Hunyuan Video" %}stdout{% elif model.model in ["Wan2.1", "Wan2.2"] %}``results/outputs/rank0_*.json``{% elif model.model in ["FLUX.1", "FLUX.1 Kontext", "FLUX.2"] %}``results/timing.json``{% elif model.model == "stable-diffusion-3.5-large" %}``benchmark_results.csv``{% endif %}.
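
For models that write a JSON report, a quick way to read the recorded step times is to pretty-print the file. For example, for the FLUX commands above, which create a ``results`` directory:

.. code-block:: shell

python3 -m json.tool results/timing.json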
|
||||
|
||||
{% if model.model == "FLUX.1" %}You may also use ``run_usp.py`` which implements USP without modifying the default diffusers pipeline. {% endif %}
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`benchmark-docker/previous-versions/xdit-history` to find documentation for previous releases
|
||||
of xDiT diffusion inference performance testing.
|
||||
@@ -254,7 +254,7 @@ PyTorch training
|
||||
The ROCm PyTorch Training Docker image now focuses on :doc:`Training a model
|
||||
with Primus and PyTorch <../training/benchmark-docker/primus-pytorch>`. The
|
||||
following example refers to the legacy workflow :ref:`Training a
|
||||
model with PyTorch <amd-pytorch-training-multinode-examples>`.
|
||||
model with PyTorch <amd-pytorch-training-multinode-examples-v259>`.
|
||||
|
||||
1. Download the ``run_multinode_train.sh`` benchmarking script from `<https://github.com/ROCm/MAD/tree/develop/scripts/pytorch_train>`__.
|
||||
|
||||
@@ -277,7 +277,7 @@ PyTorch training
|
||||
|
||||
.. seealso::
|
||||
|
||||
See :ref:`Training a model with PyTorch <amd-pytorch-multinode-examples>` for more examples and information.
|
||||
See :ref:`Training a model with PyTorch <amd-pytorch-training-multinode-examples-v259>` for more examples and information.
|
||||
|
||||
Megatron-LM
|
||||
-----------
|
||||
|
||||
@@ -92,7 +92,7 @@ GPUs, which can impact end-to-end latency.
|
||||
.. _healthcheck-install-transferbench:
|
||||
|
||||
1. To get started, use the instructions in the `TransferBench documentation
|
||||
<https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`_
|
||||
<https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`__
|
||||
or use the following commands:
|
||||
|
||||
.. code:: shell
|
||||
@@ -102,5 +102,5 @@ GPUs, which can impact end-to-end latency.
|
||||
CC=hipcc make
|
||||
|
||||
2. Run the suggested TransferBench tests -- see `TransferBench benchmarking
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#transferbench-benchmarking-results>`_
|
||||
<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html#transferbench>`__
|
||||
in the Instinct performance benchmarking documentation for instructions.
|
||||
|
||||
@@ -33,18 +33,15 @@ It includes the following software components:
|
||||
- {{ component_version }}
|
||||
|
||||
{% endfor %}
|
||||
{% if jax_version == "0.6.0" %}
|
||||
.. note::
|
||||
|
||||
Shardy is a new config in JAX 0.6.0. You might get related errors if it's
|
||||
not configured correctly. For now you can turn it off by setting
|
||||
``shardy=False`` during the training run. You can also follow the `migration
|
||||
guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
|
||||
it.
|
||||
{% endif %}
|
||||
|
||||
{% endfor %}
|
||||
|
||||
.. note::
|
||||
|
||||
The ``rocm/jax-training:maxtext-v25.9`` image has been updated to
``rocm/jax-training:maxtext-v25.9.1``. This revision should include
|
||||
a fix to address segmentation fault issues during launch. See the
|
||||
:doc:`versioned documentation <previous-versions/jax-maxtext-v25.9>`.
|
||||
|
||||
MaxText on ROCm provides the following key features to train large language models efficiently:
|
||||
|
||||
- Transformer Engine (TE)
|
||||
@@ -57,7 +54,7 @@ MaxText with on ROCm provides the following key features to train large language
|
||||
|
||||
- NANOO FP8 (for MI300X series GPUs) and FP8 (for MI355X and MI350X) quantization support
|
||||
|
||||
.. _amd-maxtext-model-support-v259:
|
||||
.. _amd-maxtext-model-support-v25.11:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -139,7 +136,7 @@ Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
.. _amd-maxtext-multi-node-setup-v259:
|
||||
.. _amd-maxtext-multi-node-setup-v25.11:
|
||||
|
||||
Multi-node configuration
|
||||
------------------------
|
||||
@@ -147,7 +144,7 @@ Multi-node configuration
|
||||
See :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your
|
||||
environment for multi-node training.
|
||||
|
||||
.. _amd-maxtext-get-started-v259:
|
||||
.. _amd-maxtext-get-started-v25.11:
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
@@ -172,7 +169,7 @@ benchmark results:
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-maxtext-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-maxtext-model-support-v25.11` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
@@ -203,7 +200,7 @@ benchmark results:
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
The following commands are optimized for {{ model.model }}. See
|
||||
:ref:`amd-maxtext-model-support-v259` to switch to another
|
||||
:ref:`amd-maxtext-model-support-v25.11` to switch to another
|
||||
available model. Some instructions and resources might not be
|
||||
available for all models and configurations.
|
||||
|
||||
@@ -325,15 +322,67 @@ benchmark results:
|
||||
|
||||
sbatch -N <num_nodes> {{ model.multinode_training_script }}
|
||||
|
||||
.. rubric:: Profiling with rocprofv3
|
||||
|
||||
If you need to collect a trace and the JAX profiler isn't working, use ``rocprofv3`` provided by the :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>` as a workaround. For example:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
rocprofv3 \
|
||||
--hip-trace \
|
||||
--kernel-trace \
|
||||
--memory-copy-trace \
|
||||
--rccl-trace \
|
||||
--output-format pftrace \
|
||||
-d ./v3_traces \ # output directory
|
||||
-- ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} # or desired command
|
||||
|
||||
You can set the directory where you want the .json traces to be
|
||||
saved using ``-d <TRACE_DIRECTORY>``. The resulting traces can be
|
||||
opened in Perfetto: `<https://ui.perfetto.dev/>`__.
|
||||
|
||||
{% else %}
|
||||
.. rubric:: Multi-node training
|
||||
|
||||
For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v259`
|
||||
For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v25.11`
|
||||
with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
- Minor performance regression (< 4%) for BF16 quantization in Llama models and Mixtral 8x7b.
|
||||
|
||||
- You might see minor loss spikes, or the loss curve may converge to slightly higher
end values compared to the previous ``jax-training`` image.
|
||||
|
||||
- For FP8 training on MI355, many models will display a warning message like:
|
||||
``Warning: Latency not found for MI_M=16, MI_N=16, MI_K=128,
|
||||
mi_input_type=BFloat8Float8_fnuz. Returning latency value of 32 (really
|
||||
slow).`` The compile step may take longer than usual, but training will run.
|
||||
This will be fixed in a future release.
|
||||
|
||||
- The built-in JAX profiler isn't working. If you need to collect a trace, use
``rocprofv3`` provided by the
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>` as a workaround. For example:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
rocprofv3 \
|
||||
--hip-trace \
|
||||
--kernel-trace \
|
||||
--memory-copy-trace \
|
||||
--rccl-trace \
|
||||
--output-format pftrace \
|
||||
-d ./v3_traces \ # output directory
|
||||
-- ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} # or desired command
|
||||
|
||||
You can set the directory where you want the .json traces to be
|
||||
saved using ``-d <TRACE_DIRECTORY>``. The resulting traces can be
|
||||
opened in Perfetto: `<https://ui.perfetto.dev/>`__.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ Training a model with Megatron-LM on ROCm
|
||||
<https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including Megatron-LM, `torchtitan, and torchtune <primus-pytorch>`__.
|
||||
including Megatron-LM and :doc:`torchtitan <primus-pytorch>`.
|
||||
|
||||
Primus with Megatron is designed to replace this ROCm Megatron-LM training workflow.
|
||||
To learn how to migrate workloads from Megatron-LM to Primus with Megatron,
|
||||
@@ -36,12 +36,10 @@ accelerate training workloads:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -49,12 +47,12 @@ accelerate training workloads:
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
.. _amd-megatron-lm-model-support:
|
||||
|
||||
.. _amd-megatron-lm-model-support-v25.11:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -99,7 +97,7 @@ accelerate training workloads:
|
||||
Some models, such as Llama, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
|
||||
.. _amd-megatron-lm-performance-measurements:
|
||||
.. _amd-megatron-lm-performance-measurements-v25.11:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
@@ -131,7 +129,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
.. _mi300x-amd-megatron-lm-training:
|
||||
.. _mi300x-amd-megatron-lm-training-v25.11:
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
@@ -140,52 +138,38 @@ Use the following instructions to set up the environment, configure the script t
|
||||
reproduce the benchmark results on MI300X Series GPUs with the AMD Megatron-LM Docker
|
||||
image.
|
||||
|
||||
.. _amd-megatron-lm-requirements:
|
||||
.. _amd-megatron-lm-requirements-v25.11:
|
||||
|
||||
Download the Docker image
|
||||
-------------------------
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set docker = data.docker %}
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 128G \
|
||||
--name megatron_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 128G \
|
||||
--name megatron_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
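   A typical re-entry sequence looks like this (a sketch mirroring the ``docker start`` / ``docker exec`` pattern used for the other training containers in these guides):

   .. code-block:: shell

      docker start megatron_training_env
      docker exec -it megatron_training_env bash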
|
||||
|
||||
@@ -206,7 +190,7 @@ Download the Docker image
|
||||
The Docker container hosts a verified commit of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__.
|
||||
|
||||
.. _amd-megatron-lm-environment-setup:
|
||||
.. _amd-megatron-lm-environment-setup-v25.11:
|
||||
|
||||
Configuration
|
||||
=============
|
||||
@@ -216,39 +200,39 @@ Configuration
|
||||
Update the ``train_llama3.sh`` configuration script in the ``examples/llama``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_llama-2-7b pyt_megatron_lm_train_llama-2-70b
|
||||
|
||||
Update the ``train_llama2.sh`` configuration script in the ``examples/llama``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
|
||||
Update the ``train_deepseekv3.sh`` configuration script in the ``examples/deepseek_v3``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v3>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_deepseek-v2-lite-16b
|
||||
|
||||
Update the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.
|
||||
|
||||
.. container:: model-doc pyt_megatron_lm_train_mixtral-8x7b pyt_megatron_lm_train_mixtral-8x22b-proxy
|
||||
|
||||
Update the ``train_mixtral_moe.sh`` configuration script in the ``examples/mixtral``
|
||||
directory of
|
||||
`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/mixtral>`__ to configure your training run.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training>`.
|
||||
Options can also be passed as command line arguments as described in :ref:`Run training <amd-megatron-lm-run-training-v25.11>`.
|
||||
|
||||
.. note::
|
||||
|
||||
See :ref:`Key options <amd-megatron-lm-benchmark-test-vars>` for more information on configuration options.
|
||||
See :ref:`Key options <amd-megatron-lm-benchmark-test-vars-v25.11>` for more information on configuration options.
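For example, the key options can be exported as environment variables before invoking a training script (a sketch using the Llama 3 script and two of the options shown later in this guide; a real run will usually set additional variables):

.. code-block:: shell

   export RECOMPUTE_ACTIVATIONS=full
   export CKPT_FORMAT=torch_dist
   bash examples/llama/train_llama3.sh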
|
||||
|
||||
Multi-node configuration
|
||||
------------------------
|
||||
@@ -256,7 +240,7 @@ Multi-node configuration
|
||||
Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
|
||||
training. See :ref:`amd-megatron-lm-multi-node-examples` for example run commands.
|
||||
|
||||
.. _amd-megatron-lm-tokenizer:
|
||||
.. _amd-megatron-lm-tokenizer-v25.11:
|
||||
|
||||
Tokenizer
|
||||
---------
|
||||
@@ -393,7 +377,7 @@ Download the dataset
|
||||
|
||||
``TOKENIZER_MODEL`` can be any accessible Hugging Face tokenizer.
|
||||
Remember to either pre-download the tokenizer or set up Hugging Face access
|
||||
otherwise when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer>` section.
|
||||
otherwise when needed -- see the :ref:`Tokenizer <amd-megatron-lm-tokenizer-v25.11>` section.
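For example, to make the tokenizer available ahead of time (the repository name below is only an illustration):

.. code-block:: shell

   # Option 1: pre-download just the tokenizer files from Hugging Face
   huggingface-cli download meta-llama/Llama-3.1-8B --include "tokenizer*"

   # Option 2: authenticate so files can be fetched on demand
   export HF_TOKEN=<your Hugging Face access token>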
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -495,15 +479,38 @@ Download the dataset
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
.. _amd-megatron-lm-run-training:
|
||||
.. _amd-megatron-lm-run-training-v25.11:
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
Use the following example commands to set up the environment, configure
|
||||
:ref:`key options <amd-megatron-lm-benchmark-test-vars>`, and run training on
|
||||
:ref:`key options <amd-megatron-lm-benchmark-test-vars-v25.11>`, and run training on
|
||||
MI300X Series GPUs with the AMD Megatron-LM environment.
|
||||
|
||||
Before starting training, export the following environment variables.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HSA_NO_SCRATCH_RECLAIM=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
|
||||
.. tab-item:: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HSA_NO_SCRATCH_RECLAIM=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
|
||||
# Set this on MI325X/MI300X only
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
Single node training
|
||||
--------------------
|
||||
|
||||
@@ -913,7 +920,7 @@ Single node training
|
||||
RECOMPUTE_ACTIVATIONS=full \
|
||||
CKPT_FORMAT=torch_dist
|
||||
|
||||
.. _amd-megatron-lm-multi-node-examples:
|
||||
.. _amd-megatron-lm-multi-node-examples-v25.11:
|
||||
|
||||
Multi-node training examples
|
||||
----------------------------
|
||||
@@ -964,7 +971,7 @@ training on 16 nodes, try the following command:
|
||||
|
||||
sbatch examples/deepseek_v3/train_deepseek_v3_slurm.sh
|
||||
|
||||
.. _amd-megatron-lm-benchmark-test-vars:
|
||||
.. _amd-megatron-lm-benchmark-test-vars-v25.11:
|
||||
|
||||
Key options
|
||||
-----------
|
||||
@@ -1029,11 +1036,6 @@ The benchmark tests support the following sets of variables.
|
||||
``RECOMPUTE_NUM_LAYERS``
|
||||
Number of layers used for checkpointing recompute.
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
|
||||
@@ -17,13 +17,22 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <http
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - 25.9 (latest)
|
||||
* - 25.11
|
||||
-
|
||||
* ROCm 7.1.0
|
||||
* JAX 0.7.1
|
||||
-
|
||||
* :doc:`Documentation <../jax-maxtext>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.11/images/sha256-18e4d8f0b8ce7a7422c58046940dd5f32249960449fca09a562b65fb8eb1562a>`__
|
||||
|
||||
* - 25.9.1
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* JAX 0.6.2
|
||||
-
|
||||
* :doc:`Documentation <../jax-maxtext>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7-jax060/images/sha256-7352212ae033a76dca2b9dceffc23c1b5f1a61a7a560082cf747a9bf1acfc9ce>`__
|
||||
* :doc:`Documentation <jax-maxtext-v25.9>`
|
||||
* `Docker Hub (25.9.1) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.9.1/images/sha256-60946cfbd470f6ee361fc9da740233a4fb2e892727f01719145b1f7627a1cff6>`__
|
||||
* `Docker Hub (25.9) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.9/images/sha256-4bb16ab58279ef09cb7a5e362c38e3fe3f901de44d8dbac5d0cb3bac5686441e>`__
|
||||
|
||||
* - 25.7
|
||||
-
|
||||
|
||||
@@ -24,7 +24,7 @@ provides a prebuilt environment for training on AMD Instinct MI300X and MI325X G
|
||||
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
|
||||
It includes the following software components:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
@@ -80,7 +80,7 @@ series GPUs. Some instructions, commands, and available training
|
||||
configurations in this documentation might vary by model -- select one to get
|
||||
started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
@@ -144,7 +144,7 @@ Pull the Docker image
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
@@ -177,7 +177,7 @@ Benchmarking
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml
|
||||
|
||||
.. _vllm-benchmark-mad:
|
||||
|
||||
|
||||
@@ -0,0 +1,365 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using JAX MaxText for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
|
||||
|
||||
******************************************
|
||||
Training a model with JAX MaxText on ROCm
|
||||
******************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm JAX MaxText
|
||||
training performance documentation. See :doc:`../jax-maxtext` for the latest version.
|
||||
|
||||
.. note::
|
||||
|
||||
We have refreshed the ``rocm/jax-training:maxtext-v25.9`` image as
|
||||
``rocm/jax-training:maxtext-v25.9.1``. This should include a fix to address
|
||||
segmentation fault issues during launch.
|
||||
|
||||
The MaxText for ROCm training Docker image
|
||||
provides a prebuilt environment for training on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs,
|
||||
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
|
||||
It includes the following software components:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for docker in dockers %}
|
||||
{% set jax_version = docker.components["JAX"] %}
|
||||
|
||||
.. tab-item:: ``{{ docker.pull_tag }}``
|
||||
:sync: {{ docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
|
||||
{% endfor %}
|
||||
{% if jax_version == "0.6.0" %}
|
||||
.. note::
|
||||
|
||||
Shardy is a new config in JAX 0.6.0. You might get related errors if it's
|
||||
not configured correctly. For now you can turn it off by setting
|
||||
``shardy=False`` during the training run. You can also follow the `migration
|
||||
guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
|
||||
it.
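If you invoke the MaxText training entry point directly, the override is typically appended as a ``key=value`` argument. The entry point and base config path below are assumptions for illustration, not part of this guide:

.. code-block:: shell

   python3 MaxText/train.py MaxText/configs/base.yml shardy=False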
|
||||
{% endif %}
|
||||
|
||||
{% endfor %}
|
||||
|
||||
MaxText on ROCm provides the following key features to train large language models efficiently:
|
||||
|
||||
- Transformer Engine (TE)
|
||||
|
||||
- Flash Attention (FA) 3 -- with or without sequence input packing
|
||||
|
||||
- GEMM tuning
|
||||
|
||||
- Multi-node support
|
||||
|
||||
- NANOO FP8 (for MI300X series GPUs) and FP8 (for MI355X and MI350X) quantization support
|
||||
|
||||
.. _amd-maxtext-model-support-v259:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on AMD Instinct
|
||||
GPUs. Some instructions, commands, and available training
|
||||
configurations in this documentation might vary by model -- select one to get
|
||||
started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. note::
|
||||
|
||||
Some models, such as Llama 3, require an external license agreement through
|
||||
a third party (for example, Meta).
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
as follows. Performance can vary for other training workloads, as AMD
|
||||
doesn’t validate configurations and run conditions outside those described.
|
||||
|
||||
Pull the Docker image
|
||||
---------------------
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
.. _amd-maxtext-multi-node-setup-v259:
|
||||
|
||||
Multi-node configuration
|
||||
------------------------
|
||||
|
||||
See :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your
|
||||
environment for multi-node training.
|
||||
|
||||
.. _amd-maxtext-get-started-v259:
|
||||
|
||||
Benchmarking
|
||||
============
|
||||
|
||||
Once the setup is complete, choose between two options to reproduce the
|
||||
benchmark results:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.9-benchmark-models.yaml
|
||||
|
||||
.. _vllm-benchmark-mad:
|
||||
|
||||
{% set docker = data.dockers[0] %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{model.mad_tag}}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% if model.mad_tag and "single-node" in model.doc_options %}
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-maxtext-model-support-v259` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. Use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{model.mad_tag}} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
|
||||
model are collected in the following path: ``~/MAD/perf.csv``.
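To take a quick look at the collected results (a sketch using standard shell tools):

.. code-block:: shell

   column -s, -t < ~/MAD/perf.csv | less -S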
|
||||
{% endif %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
The following commands are optimized for {{ model.model }}. See
|
||||
:ref:`amd-maxtext-model-support-v259` to switch to another
|
||||
available model. Some instructions and resources might not be
|
||||
available for all models and configurations.
|
||||
|
||||
.. rubric:: Download the Docker image and required scripts
|
||||
|
||||
Run the JAX MaxText benchmark tool independently by starting the
|
||||
Docker container as shown in the following snippet.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
{% if model.model_repo and "single-node" in model.doc_options %}
|
||||
.. rubric:: Single node training
|
||||
|
||||
1. Set up environment variables.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN=<Your Hugging Face token>
|
||||
export HF_HOME=<Location of saved/cached Hugging Face models>
|
||||
|
||||
``MAD_SECRETS_HFTOKEN`` is your Hugging Face access token to access models, tokenizers, and data.
|
||||
See `User access tokens <https://huggingface.co/docs/hub/en/security-tokens>`__.
|
||||
|
||||
``HF_HOME`` is where ``huggingface_hub`` will store local data. See `huggingface_hub CLI <https://huggingface.co/docs/huggingface_hub/main/en/guides/cli#huggingface-cli-download>`__.
|
||||
If you already have downloaded or cached Hugging Face artifacts, set this variable to that path.
|
||||
Downloaded files typically get cached to ``~/.cache/huggingface``.
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device=/dev/dri \
|
||||
--device=/dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
-v $HF_HOME:/hf_cache \
|
||||
-e HF_HOME=/hf_cache \
|
||||
-e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
3. In the Docker container, clone the ROCm MAD repository and navigate to the
|
||||
benchmark scripts directory at ``MAD/scripts/jax-maxtext``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/jax-maxtext
|
||||
|
||||
4. Run the setup scripts to install libraries and datasets needed
|
||||
for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./jax-maxtext_benchmark_setup.sh -m {{ model.model_repo }}
|
||||
|
||||
5. To run the training benchmark without quantization, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }}
|
||||
|
||||
For quantized training, run the script with the appropriate option for your Instinct GPU.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
|
||||
For ``fp8`` quantized training on MI355X and MI350X GPUs, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q fp8
|
||||
|
||||
{% if model.model_repo not in ["Llama-3.1-70B", "Llama-3.3-70B"] %}
|
||||
.. tab-item:: MI325X and MI300X
|
||||
|
||||
For ``nanoo_fp8`` quantized training on MI300X series GPUs, use the following command:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8
|
||||
{% endif %}
|
||||
|
||||
{% endif %}
|
||||
{% if model.multinode_training_script and "multi-node" in model.doc_options %}
|
||||
.. rubric:: Multi-node training
|
||||
|
||||
The following examples use SLURM to run on multiple nodes.
|
||||
|
||||
.. note::
|
||||
|
||||
The following scripts will launch the Docker container and run the
|
||||
benchmark. Run them outside of any Docker container.
|
||||
|
||||
1. Make sure ``$HF_HOME`` is set before running the test. See
|
||||
`ROCm benchmarking <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/readme.md>`__
|
||||
for more details on downloading the Llama models before running the
|
||||
benchmark.
|
||||
|
||||
2. To run multi-node training for {{ model.model }},
|
||||
use the
|
||||
`multi-node training script <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/{{ model.multinode_training_script }}>`__
|
||||
under the ``scripts/jax-maxtext/gpu-rocm/`` directory.
|
||||
|
||||
3. Run the multi-node training benchmark script.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sbatch -N <num_nodes> {{ model.multinode_training_script }}
|
||||
|
||||
{% else %}
|
||||
.. rubric:: Multi-node training
|
||||
|
||||
For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v259`
|
||||
with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`jax-maxtext-history` to find documentation for previous releases
|
||||
of the ``ROCm/jax-training`` Docker image.
|
||||
@@ -16,14 +16,32 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - v25.9 (latest)
|
||||
* - v25.11
|
||||
-
|
||||
* ROCm 7.1.0
|
||||
* PyTorch 2.10.0.dev20251112+rocm7.1
|
||||
-
|
||||
* :doc:`Primus Megatron documentation <../primus-megatron>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__
|
||||
|
||||
* - v25.10
|
||||
-
|
||||
* ROCm 7.1.0
|
||||
* PyTorch 2.10.0.dev20251112+rocm7.1
|
||||
-
|
||||
* :doc:`Primus Megatron documentation <primus-megatron-v25.10>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <megatron-lm-v25.10>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__
|
||||
|
||||
* - v25.9
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* Primus 0.3.0
|
||||
* PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
-
|
||||
* :doc:`Primus Megatron documentation <../primus-megatron>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <../megatron-lm>`
|
||||
* :doc:`Primus Megatron documentation <primus-megatron-v25.9>`
|
||||
* :doc:`Megatron-LM (legacy) documentation <megatron-lm-v25.9>`
|
||||
* `Docker Hub (gfx950) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
|
||||
* `Docker Hub (gfx942) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357>`__
|
||||
|
||||
|
||||
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,448 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
****************************************
|
||||
Training a model with Primus and PyTorch
|
||||
****************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm Primus PyTorch training
|
||||
performance benchmark documentation. See :doc:`../primus-pytorch` for the latest version.
|
||||
|
||||
`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
|
||||
LLM training framework. It streamlines LLM
|
||||
training on AMD Instinct GPUs using a modular, reproducible configuration paradigm.
|
||||
Primus now supports the PyTorch torchtitan backend.
|
||||
|
||||
.. note::
|
||||
|
||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including torchtitan and :doc:`Megatron-LM <primus-megatron>`.
|
||||
|
||||
Primus with the PyTorch torchtitan backend is designed to replace the
|
||||
:doc:`ROCm PyTorch training <pytorch-training>` workflow. See
|
||||
:doc:`pytorch-training` for steps to run workloads without Primus.
|
||||
|
||||
AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and
|
||||
MI300X GPUs containing essential components for Primus and PyTorch training
|
||||
with Primus Turbo optimizations.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-primus-pytorch-model-support-v2510:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs.
|
||||
Some instructions, commands, and training recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. seealso::
|
||||
|
||||
For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
|
||||
see the :doc:`pytorch-training` documentation (without Primus).
|
||||
|
||||
.. _amd-primus-pytorch-performance-measurements-v2510:
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t test configurations and run conditions outside those described.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ data.docker.pull_tag }}
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
Once the setup is complete, choose between the following two workflows to start benchmarking training.
|
||||
For fine-tuning workloads and multi-node training examples, see :doc:`pytorch-training` (without Primus).
|
||||
For best performance on MI325X, MI350X, and MI355X GPUs, you might need to
|
||||
tweak some configurations (such as batch sizes).
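Batch size is exposed as a command-line override in the examples below, so it can be tuned without editing the YAML configs. A sketch (the config path matches one of the Llama 3.1 8B examples that follow):

.. code-block:: shell

   EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
   bash examples/run_pretrain.sh --training.local_batch_size 4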
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one node with the {{ model.precision }} data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{ model.mad_tag }} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Primus benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v2510` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
.. rubric:: Pretraining
|
||||
|
||||
To get started, navigate to the ``Primus`` directory in your container.
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd /workspace/Primus
|
||||
|
||||
Now, to start the pretraining benchmark, use the ``run_pretrain.sh`` script
|
||||
included with Primus with the appropriate options.
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-8b
|
||||
|
||||
Use the following command to train Llama 3.1 8B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 4
|
||||
|
||||
|
||||
To train Llama 3.1 8B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 7
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 5
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-70b
|
||||
|
||||
Use the following command to train Llama 3.1 70B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 4
|
||||
|
||||
To train Llama 3.1 70B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 5
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 3
|
||||
|
||||
.. container:: model-doc primus_pyt_train_deepseek-v2
|
||||
|
||||
Use the following command to train DeepSeek V2 16B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 16
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 10
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
|
||||
To train DeepSeek V2 16B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 16
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 8
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
|
||||
Framework for Large Models on AMD GPUs <https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html>`__.
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||
of the ``ROCm/pytorch-training`` Docker image.
|
||||
@@ -0,0 +1,574 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
****************************************
|
||||
Training a model with Primus and PyTorch
|
||||
****************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm Primus PyTorch training
|
||||
performance benchmark documentation. See :doc:`../primus-pytorch` for the latest version.
|
||||
|
||||
`Primus <https://github.com/AMD-AGI/Primus>`__ is a unified and flexible
|
||||
LLM training framework. It streamlines LLM
|
||||
training on AMD Instinct GPUs using a modular, reproducible configuration paradigm.
|
||||
Primus now supports the PyTorch torchtitan backend.
|
||||
|
||||
.. note::
|
||||
|
||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
|
||||
|
||||
Primus with the PyTorch torchtitan backend is designed to replace the
|
||||
:doc:`ROCm PyTorch training <../pytorch-training>` workflow. See
|
||||
:doc:`../pytorch-training` for steps to run workloads without Primus.
|
||||
|
||||
AMD provides a ready-to-use Docker image for MI355X, MI350X, MI325X, and
|
||||
MI300X GPUs containing essential components for Primus and PyTorch training
|
||||
with Primus Turbo optimizations.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-primus-pytorch-model-support-v259:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X GPUs.
|
||||
Some instructions, commands, and training recommendations in this documentation might
|
||||
vary by model -- select one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-12 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. seealso::
|
||||
|
||||
For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
|
||||
see the :doc:`../pytorch-training` documentation (without Primus).
|
||||
|
||||
.. _amd-primus-pytorch-performance-measurements-v259:
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t test configurations and run conditions outside those described.
|
||||
|
||||
Pull the Docker image
|
||||
=====================
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
Once the setup is complete, choose between the following two workflows to start benchmarking training.
|
||||
For fine-tuning workloads and multi-node training examples, see :doc:`../pytorch-training` (without Primus).
|
||||
For best performance on MI325X, MI350X, and MI355X GPUs, you might need to
|
||||
tweak some configurations (such as batch sizes).
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one node with the {{ model.precision }} data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{ model.mad_tag }} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
.. note::
|
||||
|
||||
Currently, Primus torchtitan models are run with Primus Turbo
|
||||
enabled for enhanced performance. To disable Primus Turbo,
|
||||
modify the respective configuration file
|
||||
``scripts/primus/pytorch_train/primus_torchtitan_scripts/llama3_[8B|70B]-[BF16|FP8].yaml``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Primus benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
.. rubric:: Pretraining
|
||||
|
||||
To get started, navigate to the ``Primus`` directory in your container.
|
||||
|
||||
.. code-block::
|
||||
|
||||
cd /workspace/Primus
|
||||
|
||||
Now, to start the pretraining benchmark, use the ``run_pretrain.sh`` script
|
||||
included with Primus with the appropriate options.
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-8b
|
||||
|
||||
Use the following command to train Llama 3.1 8B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 4
|
||||
|
||||
|
||||
To train Llama 3.1 8B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 8
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 7
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-70b
|
||||
|
||||
Use the following command to train Llama 3.1 70B with BF16 precision using Primus torchtitan.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI350X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 8
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 4
|
||||
|
||||
To train Llama 3.1 70B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI350X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 3
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Standalone torchtitan benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
3. Navigate to the ``torchtitan`` workspace directory.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/torchtitan
|
||||
|
||||
.. rubric:: Download the tokenizer
|
||||
|
||||
1. The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
2. Download the tokenizer for your model.
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
python3 scripts/download_tokenizer.py \
|
||||
--repo_id {{ model.model_repo }} \
|
||||
--tokenizer_path "original" \
|
||||
--hf_token=${HF_TOKEN}
|
||||
|
||||
.. rubric:: Pretraining examples
|
||||
|
||||
Run the training script with the appropriate configuration file.
|
||||
|
||||
To train with BF16 precision, use the following command:
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CONFIG_FILE={{ model.config_file.bf16 }} \
|
||||
./run_train.sh
|
||||
|
||||
To train with FP8 precision, use the following command:
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
CONFIG_FILE={{ model.config_file.fp8 }} \
|
||||
./run_train.sh
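If you need to adjust settings beyond the precision, torchtitan configuration options can usually be overridden on the command line. The following is a sketch only: it assumes this ``run_train.sh`` forwards extra arguments to torchtitan as configuration overrides, as the upstream torchtitan script does, and uses ``--training.steps`` purely as an illustrative override.

.. code-block:: shell

# Sketch: run a short BF16 job by overriding the number of training steps
CONFIG_FILE={{ model.config_file.bf16 }} \
./run_train.sh --training.steps 50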
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- For an introduction to Primus, see `Primus: A Lightweight, Unified Training
|
||||
Framework for Large Models on AMD GPUs <https://rocm.blogs.amd.com/software-tools-optimization/primus/README.html>`__.
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||
of the ``ROCm/pytorch-training`` Docker image.
|
||||
@@ -16,14 +16,32 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
|
||||
- Components
|
||||
- Resources
|
||||
|
||||
* - v25.9 (latest)
|
||||
* - v25.11
|
||||
-
|
||||
* ROCm 7.1.0
|
||||
* PyTorch 2.10.0.dev20251112+rocm7.1
|
||||
-
|
||||
* :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
|
||||
* :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__
|
||||
|
||||
* - v25.10
|
||||
-
|
||||
* ROCm 7.1.0
|
||||
* PyTorch 2.10.0.dev20251112+rocm7.1
|
||||
-
|
||||
* :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.10>`
|
||||
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.10>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/primus/v25.10/images/sha256-140c37cd2eeeb183759b9622543fc03cc210dc97cbfa18eeefdcbda84420c197>`__
|
||||
|
||||
* - v25.9
|
||||
-
|
||||
* ROCm 7.0.0
|
||||
* Primus 0.3.0
|
||||
* PyTorch 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
|
||||
-
|
||||
* :doc:`Primus PyTorch Training documentation <../primus-pytorch>`
|
||||
* :doc:`PyTorch training (legacy) documentation <../pytorch-training>`
|
||||
* :doc:`Primus PyTorch Training documentation <primus-pytorch-v25.9>`
|
||||
* :doc:`PyTorch training (legacy) documentation <pytorch-training-v25.9>`
|
||||
* `Docker Hub (gfx950) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6>`__
|
||||
* `Docker Hub (gfx942) <https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357>`__
|
||||
|
||||
|
||||
@@ -0,0 +1,669 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
**************************************
|
||||
Training a model with PyTorch on ROCm
|
||||
**************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm PyTorch training
|
||||
performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
|
||||
|
||||
.. note::
|
||||
|
||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
|
||||
|
||||
See :doc:`../primus-pytorch` for details.
|
||||
|
||||
PyTorch is an open-source machine learning framework that is widely used for
|
||||
model training with GPU-optimized components for transformer-based models.
|
||||
The PyTorch for ROCm training Docker image provides a prebuilt optimized
|
||||
environment for fine-tuning and pretraining a model on AMD Instinct MI325X
|
||||
and MI300X GPUs. It includes the following software components to accelerate
|
||||
training workloads:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-pytorch-training-model-support-v2510:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct
|
||||
MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and
|
||||
training recommendations in this documentation might vary by model -- select
|
||||
one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _amd-pytorch-training-supported-training-modes-v2510:
|
||||
|
||||
The following table lists supported training modes per model.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. dropdown:: Supported training modes
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Model
|
||||
- Supported training modes
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if model.training_modes %}
|
||||
* - {{ model.model }}
|
||||
- ``{{ model.training_modes | join('``, ``') }}``
|
||||
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. note::
|
||||
|
||||
Some model and fine-tuning combinations are not listed. This is
|
||||
because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
|
||||
doesn't provide default YAML configurations for them.
|
||||
For advanced usage, you can create a custom configuration to enable
|
||||
unlisted fine-tuning methods by using an existing file in the
|
||||
``/workspace/torchtune/recipes/configs`` directory as a template.
|
||||
|
||||
.. _amd-pytorch-training-performance-measurements-v2510:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
page provides reference throughput and latency measurements for training
|
||||
popular AI models.
|
||||
|
||||
.. note::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
should not be interpreted as the peak performance achievable by AMD
|
||||
Instinct MI325X and MI300X GPUs or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t test configurations and run conditions outside those described.
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
|
||||
|
||||
{% set docker = data.docker %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between two options to start benchmarking training:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one node with the {{ model.precision }} data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{ model.mad_tag }} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
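To take a quick look at the collected results from the host, you can render the CSV as an aligned table with the standard ``column`` utility, for example:

.. code-block:: shell

# Print the benchmark results in a readable, column-aligned form
column -s, -t < ~/MAD/perf.csv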
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v2510` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
|
||||
repository and navigate to the benchmark scripts directory
|
||||
``/workspace/MAD/scripts/pytorch_train``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/pytorch_train
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
1. The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
2. Run the setup script to install libraries and datasets needed for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_setup.sh
|
||||
|
||||
.. container:: model-doc pyt_train_llama-3.1-8b
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
.. container:: model-doc pyt_train_llama-3.1-70b
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
* - ``torchdata``
|
||||
- `TorchData <https://meta-pytorch.org/data/beta/index.html#torchdata>`__
|
||||
|
||||
* - ``tomli``
|
||||
- `Tomli <https://pypi.org/project/tomli/>`__
|
||||
|
||||
* - ``tiktoken``
|
||||
- `tiktoken <https://github.com/openai/tiktoken>`__
|
||||
|
||||
* - ``blobfile``
|
||||
- `blobfile <https://pypi.org/project/blobfile/>`__
|
||||
|
||||
* - ``tabulate``
|
||||
- `tabulate <https://pypi.org/project/tabulate/>`__
|
||||
|
||||
* - ``wandb``
|
||||
- `Weights & Biases <https://github.com/wandb/wandb>`__
|
||||
|
||||
* - ``sentencepiece``
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
|
||||
|
||||
.. container:: model-doc pyt_train_flux
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`__ 3.2.0
|
||||
|
||||
* - ``sentencepiece``
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
|
||||
|
||||
* - ``csvkit``
|
||||
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`__ 2.0.1
|
||||
|
||||
* - ``deepspeed``
|
||||
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`__ 0.16.2
|
||||
|
||||
* - ``diffusers``
|
||||
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`__ 0.31.0
|
||||
|
||||
* - ``GitPython``
|
||||
- `GitPython <https://github.com/gitpython-developers/GitPython>`__ 3.1.44
|
||||
|
||||
* - ``opencv-python-headless``
|
||||
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`__ 4.10.0.84
|
||||
|
||||
* - ``peft``
|
||||
- `PEFT <https://huggingface.co/docs/peft/en/index>`__ 0.14.0
|
||||
|
||||
* - ``protobuf``
|
||||
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`__ 5.29.2
|
||||
|
||||
* - ``pytest``
|
||||
- `PyTest <https://docs.pytest.org/en/stable/>`__ 8.3.4
|
||||
|
||||
* - ``python-dotenv``
|
||||
- `python-dotenv <https://pypi.org/project/python-dotenv/>`__ 1.0.1
|
||||
|
||||
* - ``seaborn``
|
||||
- `Seaborn <https://seaborn.pydata.org/>`__ 0.13.2
|
||||
|
||||
* - ``transformers``
|
||||
- `Transformers <https://huggingface.co/docs/transformers/en/index>`__ 4.47.0
|
||||
|
||||
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
|
||||
|
||||
* `frank-chieng/chinese_architecture_siheyuan <https://huggingface.co/datasets/frank-chieng/chinese_architecture_siheyuan>`__
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
{% set training_modes = model.training_modes %}
|
||||
{% set training_mode_descs = {
|
||||
"pretrain": "Benchmark pre-training.",
|
||||
"HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
|
||||
{% if available_modes %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Pretraining
|
||||
|
||||
To start the pre-training benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
{% if model.mad_tag == "pyt_train_dlrm" %}
|
||||
|
||||
1. Go to the DLRM directory.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cd /workspace/DLRMBenchmark
|
||||
|
||||
2. To run the single node training benchmark for DLRM-v2 with TF32 precision,
|
||||
run the following script.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./launch_training_single_node.sh
|
||||
|
||||
To run with MAD within the Docker container, use the following command.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t pretrain -m DLRM
|
||||
|
||||
{% else %}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
{% if model.mad_tag == "pyt_train_flux" %}
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
Currently, FLUX models are not supported out-of-the-box in this Docker image.
To use FLUX, refer to the ``rocm/pytorch-training`` Docker image: :doc:`pytorch-training-v25.6`.
|
||||
|
||||
Occasionally, downloading the Flux dataset might fail. In the event of this
|
||||
error, manually download it from Hugging Face at
|
||||
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
|
||||
the required dataset.
|
||||
{% endif %}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
{% set training_modes = model.training_modes %}
|
||||
{% set training_mode_descs = {
|
||||
"posttrain": "Benchmark post-training.",
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["posttrain"]) | list %}
|
||||
{% if available_modes %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Post-training
|
||||
|
||||
To start the post-training benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
|
||||
{% endif %}
|
||||
|
||||
{% set training_mode_descs = {
|
||||
"finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
|
||||
"finetune_lora": "LoRA fine-tuning (BF16 supported).",
|
||||
"finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
|
||||
"HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
|
||||
{% if available_modes %}
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Fine-tuning
|
||||
|
||||
To start the fine-tuning benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v2510>`.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t $training_mode \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
|
||||
- All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 16384.
|
||||
- Sequence length for the language model.
|
||||
|
||||
{% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
|
||||
.. note::
|
||||
|
||||
For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
|
||||
use the following torchtune commit for compatibility:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
|
||||
|
||||
{% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
|
||||
.. note::
|
||||
|
||||
You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
|
||||
input tensor should be smaller than max_seq_len (4096)``.
|
||||
This error indicates that an input sequence is longer than the model's maximum context window.
|
||||
|
||||
Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
|
||||
tokens in this case). You can resolve this by truncating the input or splitting
|
||||
it into smaller chunks before passing it to the model (see the example after this note).
|
||||
|
||||
Note on reproducibility: The results in this guide are based on
|
||||
commit ``b4c98ac`` from the upstream
|
||||
`<https://github.com/pytorch/torchtune>`__ repository. For the
|
||||
latest updates, you can use the main branch.
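Following up on the sequence-length limit described in the note above, within this benchmarking workflow the simplest remedy is to request a sequence length that fits the model's context window, for example:

.. code-block:: shell

# Keep the sequence length within Llama 2's 4096-token context window
./pytorch_benchmark_report.sh -t $training_mode \
-m {{ model.model_repo }} \
-p BF16 \
-s 4096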
|
||||
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
|
||||
|
||||
.. _amd-pytorch-training-multinode-examples-v2510:
|
||||
|
||||
Multi-node training
|
||||
-------------------
|
||||
|
||||
Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
|
||||
training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands.
|
||||
|
||||
Pre-training
|
||||
~~~~~~~~~~~~
|
||||
|
||||
Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
|
||||
|
||||
To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# In the MAD repository
|
||||
cd scripts/pytorch_train
|
||||
sbatch run_slurm_train.sh
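After submitting the job, you can monitor it with standard Slurm commands. This is a generic sketch; the exact log file name and location depend on the ``#SBATCH`` directives in ``run_slurm_train.sh``.

.. code-block:: shell

# Check the status of your queued and running jobs
squeue -u $USER

# Follow the job output once it starts (replace <jobid>; the file name
# depends on the output path configured in the SBATCH directives)
tail -f slurm-<jobid>.out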
|
||||
|
||||
Fine-tuning
|
||||
~~~~~~~~~~~
|
||||
|
||||
Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
|
||||
|
||||
To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli login # Get access to HF Llama model space
|
||||
huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
|
||||
# In the MAD repository
|
||||
cd scripts/pytorch_train
|
||||
sbatch Torchtune_Multinode.sh
|
||||
|
||||
.. note::
|
||||
|
||||
Information regarding benchmark setup:
|
||||
|
||||
* By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
|
||||
* You can adjust the torchtune `YAML configuration file
|
||||
<https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
|
||||
if you're using a different model.
|
||||
* The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
|
||||
* Set the ``mounting_paths`` inside the SLURM script.
|
||||
|
||||
Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||
of the ``ROCm/pytorch-training`` Docker image.
|
||||
@@ -240,7 +240,7 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
* - ``torchdata``
|
||||
- `TorchData <https://pytorch.org/data/beta/index.html>`_
|
||||
- `TorchData <https://meta-pytorch.org/data/beta/index.html>`_
|
||||
|
||||
* - ``tomli``
|
||||
- `Tomli <https://pypi.org/project/tomli/>`_
|
||||
|
||||
@@ -0,0 +1,667 @@
|
||||
:orphan:
|
||||
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
**************************************
|
||||
Training a model with PyTorch on ROCm
|
||||
**************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm PyTorch training
|
||||
performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
|
||||
|
||||
.. note::
|
||||
|
||||
For a unified training solution on AMD GPUs with ROCm, the `rocm/pytorch-training
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including torchtitan and :doc:`Megatron-LM <../primus-megatron>`.
|
||||
|
||||
See :doc:`../primus-pytorch` for details.
|
||||
|
||||
PyTorch is an open-source machine learning framework that is widely used for
|
||||
model training with GPU-optimized components for transformer-based models.
|
||||
The PyTorch for ROCm training Docker image provides a prebuilt optimized
|
||||
environment for fine-tuning and pretraining a model on AMD Instinct MI325X
|
||||
and MI300X GPUs. It includes the following software components to accelerate
|
||||
training workloads:
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-pytorch-training-model-support-v259:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct
|
||||
MI355X, MI350X, MI325X, and MI300X GPUs. Some instructions, commands, and
|
||||
training recommendations in this documentation might vary by model -- select
|
||||
one to get started.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. raw:: html
|
||||
|
||||
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
|
||||
<div class="row gx-0">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row gx-0 pt-1">
|
||||
<div class="col-2 me-1 px-2 model-param-head">Variant</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if models|length % 3 == 0 %}
|
||||
<div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% else %}
|
||||
<div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
.. _amd-pytorch-training-supported-training-modes-v259:
|
||||
|
||||
The following table lists supported training modes per model.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set model_groups = data.model_groups %}
|
||||
.. dropdown:: Supported training modes
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Model
|
||||
- Supported training modes
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% set models = model_group.models %}
|
||||
{% for model in models %}
|
||||
{% if model.training_modes %}
|
||||
* - {{ model.model }}
|
||||
- ``{{ model.training_modes | join('``, ``') }}``
|
||||
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. note::
|
||||
|
||||
Some model and fine-tuning combinations are not listed. This is
|
||||
because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
|
||||
doesn't provide default YAML configurations for them.
|
||||
For advanced usage, you can create a custom configuration to enable
|
||||
unlisted fine-tuning methods by using an existing file in the
|
||||
``/workspace/torchtune/recipes/configs`` directory as a template.
|
||||
|
||||
.. _amd-pytorch-training-performance-measurements-v259:
|
||||
|
||||
Performance measurements
|
||||
========================
|
||||
|
||||
To evaluate performance, the
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
page provides reference throughput and latency measurements for training
|
||||
popular AI models.
|
||||
|
||||
.. note::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
should not be interpreted as the peak performance achievable by AMD
|
||||
Instinct MI325X and MI300X GPUs or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
|
||||
Before running AI workloads, it's important to validate that your AMD hardware is configured
|
||||
correctly and performing optimally.
|
||||
|
||||
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
|
||||
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
|
||||
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
|
||||
before starting training.
|
||||
|
||||
To test for optimal performance, consult the recommended :ref:`System health benchmarks
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
This Docker image is optimized for specific model configurations outlined
|
||||
below. Performance can vary for other training workloads, as AMD
|
||||
doesn’t test configurations and run conditions outside those described.
|
||||
|
||||
Run training
|
||||
============
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.9-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
Once the setup is complete, choose between two options to start benchmarking training:
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MAD-integrated benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD
|
||||
pip install -r requirements.txt
|
||||
|
||||
2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
|
||||
using one node with the {{ model.precision }} data type on the host machine.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
|
||||
madengine run \
|
||||
--tags {{ model.mad_tag }} \
|
||||
--keep-model-dir \
|
||||
--live-output \
|
||||
--timeout 28800
|
||||
|
||||
MAD launches a Docker container with the name
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. tab-item:: Standalone benchmarking
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
|
||||
repository and navigate to the benchmark scripts directory
|
||||
``/workspace/MAD/scripts/pytorch_train``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git clone https://github.com/ROCm/MAD
|
||||
cd MAD/scripts/pytorch_train
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
1. The following benchmarking examples require downloading models and datasets
|
||||
from Hugging Face. To ensure successful access to gated repos, set your
|
||||
``HF_TOKEN``.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export HF_TOKEN=$your_personal_hugging_face_access_token
|
||||
|
||||
2. Run the setup script to install libraries and datasets needed for benchmarking.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_setup.sh
|
||||
|
||||
.. container:: model-doc pyt_train_llama-3.1-8b
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
.. container:: model-doc pyt_train_llama-3.1-70b
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
|
||||
|
||||
* - ``torchdata``
|
||||
- `TorchData <https://meta-pytorch.org/data/beta/index.html#torchdata>`__
|
||||
|
||||
* - ``tomli``
|
||||
- `Tomli <https://pypi.org/project/tomli/>`__
|
||||
|
||||
* - ``tiktoken``
|
||||
- `tiktoken <https://github.com/openai/tiktoken>`__
|
||||
|
||||
* - ``blobfile``
|
||||
- `blobfile <https://pypi.org/project/blobfile/>`__
|
||||
|
||||
* - ``tabulate``
|
||||
- `tabulate <https://pypi.org/project/tabulate/>`__
|
||||
|
||||
* - ``wandb``
|
||||
- `Weights & Biases <https://github.com/wandb/wandb>`__
|
||||
|
||||
* - ``sentencepiece``
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
|
||||
|
||||
.. container:: model-doc pyt_train_flux
|
||||
|
||||
``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Library
|
||||
- Reference
|
||||
|
||||
* - ``accelerate``
|
||||
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
|
||||
|
||||
* - ``datasets``
|
||||
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`__ 3.2.0
|
||||
|
||||
* - ``sentencepiece``
|
||||
- `SentencePiece <https://github.com/google/sentencepiece>`__ 0.2.0
|
||||
|
||||
* - ``tensorboard``
|
||||
- `TensorBoard <https://www.tensorflow.org/tensorboard>`__ 2.18.0
|
||||
|
||||
* - ``csvkit``
|
||||
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`__ 2.0.1
|
||||
|
||||
* - ``deepspeed``
|
||||
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`__ 0.16.2
|
||||
|
||||
* - ``diffusers``
|
||||
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`__ 0.31.0
|
||||
|
||||
* - ``GitPython``
|
||||
- `GitPython <https://github.com/gitpython-developers/GitPython>`__ 3.1.44
|
||||
|
||||
* - ``opencv-python-headless``
|
||||
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`__ 4.10.0.84
|
||||
|
||||
* - ``peft``
|
||||
- `PEFT <https://huggingface.co/docs/peft/en/index>`__ 0.14.0
|
||||
|
||||
* - ``protobuf``
|
||||
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`__ 5.29.2
|
||||
|
||||
* - ``pytest``
|
||||
- `PyTest <https://docs.pytest.org/en/stable/>`__ 8.3.4
|
||||
|
||||
* - ``python-dotenv``
|
||||
- `python-dotenv <https://pypi.org/project/python-dotenv/>`__ 1.0.1
|
||||
|
||||
* - ``seaborn``
|
||||
- `Seaborn <https://seaborn.pydata.org/>`__ 0.13.2
|
||||
|
||||
* - ``transformers``
|
||||
- `Transformers <https://huggingface.co/docs/transformers/en/index>`__ 4.47.0
|
||||
|
||||
``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
|
||||
|
||||
* `frank-chieng/chinese_architecture_siheyuan <https://huggingface.co/datasets/frank-chieng/chinese_architecture_siheyuan>`__
|
||||
|
||||
{% for model_group in model_groups %}
|
||||
{% for model in model_group.models %}
|
||||
{% set training_modes = model.training_modes %}
|
||||
{% set training_mode_descs = {
|
||||
"pretrain": "Benchmark pre-training.",
|
||||
"HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
|
||||
{% if available_modes %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Pre-training
|
||||
|
||||
To start the pre-training benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
{% if model.mad_tag == "pyt_train_flux" %}
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. note::
|
||||
|
||||
Currently, FLUX models are not supported out-of-the-box in this Docker image.
To use FLUX, refer to the ``rocm/pytorch-training`` Docker image: :doc:`previous-versions/pytorch-training-v25.6`.
|
||||
|
||||
Occasionally, downloading the Flux dataset might fail. In the event of this
|
||||
error, manually download it from Hugging Face at
|
||||
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
|
||||
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
|
||||
the required dataset.
|
||||
{% endif %}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
|
||||
{% endif %}
|
||||
|
||||
{% set training_modes = model.training_modes %}
|
||||
{% set training_mode_descs = {
|
||||
"posttrain": "Benchmark post-training.",
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["posttrain"]) | list %}
|
||||
{% if available_modes %}
|
||||
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Post-training
|
||||
|
||||
To start the post-training benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
|
||||
- Only Llama 3.1 8B supports FP8 precision.
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 8192. 8192 by default.
- Sequence length for the language model.
|
||||
{% endif %}
|
||||
|
||||
{% set training_mode_descs = {
|
||||
"finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
|
||||
"finetune_lora": "LoRA fine-tuning (BF16 supported).",
|
||||
"finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
|
||||
"HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
|
||||
} %}
|
||||
{% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
|
||||
{% if available_modes %}
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
.. rubric:: Fine-tuning
|
||||
|
||||
To start the fine-tuning benchmark, use the following command with the
|
||||
appropriate options. See the following list of options and their descriptions.
|
||||
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v259>`.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
./pytorch_benchmark_report.sh -t $training_mode \
|
||||
-m {{ model.model_repo }} \
|
||||
-p $datatype \
|
||||
-s $sequence_length
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Options
|
||||
- Description
|
||||
|
||||
{% for mode in available_modes %}
|
||||
* - {% if loop.first %}``$training_mode``{% endif %}
|
||||
- ``{{ mode }}``
|
||||
- {{ training_mode_descs[mode] }}
|
||||
{% endfor %}
|
||||
|
||||
* - ``$datatype``
|
||||
- ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
|
||||
- All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}
|
||||
|
||||
* - ``$sequence_length``
|
||||
- Between 2048 and 16384.
|
||||
- Sequence length for the language model.
|
||||
|
||||
{% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
|
||||
.. note::
|
||||
|
||||
For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
|
||||
use the following torchtune commit for compatibility:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
|
||||
|
||||
{% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
|
||||
.. note::
|
||||
|
||||
You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
|
||||
input tensor should be smaller than max_seq_len (4096)``.
|
||||
This error indicates that an input sequence is longer than the model's maximum context window.
|
||||
|
||||
Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
|
||||
tokens in this case). You can resolve this by truncating the input or splitting
|
||||
it into smaller chunks before passing it to the model. For this benchmark, that
means passing a ``$sequence_length`` value (the ``-s`` option) of at most 4096
for Llama 2 models.
|
||||
|
||||
Note on reproducibility: The results in this guide are based on
|
||||
commit ``b4c98ac`` from the upstream
|
||||
`<https://github.com/pytorch/torchtune>`__ repository. For the
|
||||
latest updates, you can use the main branch.
|
||||
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. rubric:: Benchmarking examples
|
||||
|
||||
For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
|
||||
|
||||
.. _amd-pytorch-training-multinode-examples-v259:
|
||||
|
||||
Multi-node training
|
||||
-------------------
|
||||
|
||||
Refer to :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your environment for multi-node
|
||||
training. See :ref:`rocm-for-ai-multi-node-setup-pyt-train-example` for example Slurm run commands.
|
||||
|
||||
Pre-training
|
||||
~~~~~~~~~~~~
|
||||
|
||||
Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
|
||||
|
||||
To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
# In the MAD repository
|
||||
cd scripts/pytorch_train
|
||||
sbatch run_slurm_train.sh
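After submitting the job, you can check its progress with standard Slurm
commands, for example:

.. code-block:: shell

squeue -u $USER              # list your queued and running jobs
scontrol show job <job_id>   # show details for a specific job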
|
||||
|
||||
Fine-tuning
|
||||
~~~~~~~~~~~
|
||||
|
||||
Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
|
||||
|
||||
To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
huggingface-cli login # Get access to HF Llama model space
|
||||
huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
|
||||
# In the MAD repository
|
||||
cd scripts/pytorch_train
|
||||
sbatch Torchtune_Multinode.sh
|
||||
|
||||
.. note::
|
||||
|
||||
Information regarding benchmark setup:
|
||||
|
||||
* By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
|
||||
* You can adjust the torchtune `YAML configuration file
|
||||
<https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
|
||||
if you're using a different model.
|
||||
* The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
|
||||
* Set the ``mounting_paths`` inside the SLURM script.
|
||||
|
||||
Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
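For example, to list the logs and follow the most recent one while the job is
still running (the file name is illustrative):

.. code-block:: shell

ls result_torchtune/
tail -f result_torchtune/<log_file>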
|
||||
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
|
||||
|
||||
- To learn more about system settings and management practices to configure your system for
|
||||
AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
|
||||
|
||||
- For a list of other ready-made Docker images for AI with ROCm, see
|
||||
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
|
||||
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`pytorch-training-history` to find documentation for previous releases
|
||||
of the ``ROCm/pytorch-training`` Docker image.
|
||||
@@ -18,7 +18,7 @@ model training. Performance acceleration is powered by `Primus Turbo
|
||||
<https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including Megatron-LM, `torchtitan, and torchtune <primus-pytorch>`__.
|
||||
including Megatron-LM and :doc:`torchtitan <primus-pytorch>`.
|
||||
|
||||
Primus with Megatron is designed to replace the :doc:`ROCm Megatron-LM
|
||||
training <megatron-lm>` workflow. To learn how to migrate workloads from
|
||||
@@ -31,12 +31,10 @@ Megatron-LM.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -44,13 +42,12 @@ Megatron-LM.
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-primus-megatron-lm-model-support-v259:
|
||||
.. _amd-primus-megatron-lm-model-support-v25.11:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -111,7 +108,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
|
||||
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
|
||||
system's configuration.
|
||||
|
||||
.. _mi300x-amd-primus-megatron-lm-training-v259:
|
||||
.. _mi300x-amd-primus-megatron-lm-training-v25.11:
|
||||
|
||||
Environment setup
|
||||
=================
|
||||
@@ -121,69 +118,55 @@ Environment setup
|
||||
Use the following instructions to set up the environment, configure the script to train models, and
|
||||
reproduce the benchmark results on AMD Instinct GPUs.
|
||||
|
||||
.. _amd-primus-megatron-lm-requirements-v259:
|
||||
.. _amd-primus-megatron-lm-requirements-v25.11:
|
||||
|
||||
Pull the Docker image
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set docker = data.docker %}
|
||||
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
2. Launch the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
--shm-size 128G \
|
||||
--name primus_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
.. code-block:: shell
|
||||
Use these commands if you exit the ``primus_training_env`` container and need to return to it.
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--device /dev/infiniband \
|
||||
--network host --ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
--shm-size 128G \
|
||||
--name primus_training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
.. code-block:: shell
|
||||
|
||||
3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.
|
||||
docker start primus_training_env
|
||||
docker exec -it primus_training_env bash
|
||||
|
||||
.. code-block:: shell
|
||||
The Docker container hosts verified commit ``c4c083de`` of the `Primus
|
||||
<https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/>`__ repository.
|
||||
|
||||
docker start primus_training_env
|
||||
docker exec -it primus_training_env bash
|
||||
|
||||
The Docker container hosts verified commit ``e16b27b`` of the `Primus
|
||||
<https://github.com/AMD-AGI/Primus/tree/e16b27b>`__ repository.
|
||||
|
||||
.. _amd-primus-megatron-lm-environment-setup-v259:
|
||||
.. _amd-primus-megatron-lm-environment-setup-v25.11:
|
||||
|
||||
Configuration
|
||||
=============
|
||||
|
||||
Primus defines a training configuration in YAML for each model in
|
||||
`examples/megatron/configs <https://github.com/AMD-AGI/rss/tree/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs>`__.
|
||||
`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/examples/megatron/configs>`__.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||
|
||||
@@ -224,7 +207,7 @@ You can use either mock data or real data for training.
|
||||
|
||||
Ensure that the files are accessible inside the Docker container.
|
||||
|
||||
.. _amd-primus-megatron-lm-tokenizer-v259:
|
||||
.. _amd-primus-megatron-lm-tokenizer-v25.11:
|
||||
|
||||
Tokenizer
|
||||
---------
|
||||
@@ -245,7 +228,7 @@ right permissions to access the tokenizer for each model.
|
||||
<https://github.com/AMD-AGI/Primus/blob/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
|
||||
definition.
|
||||
|
||||
.. _amd-primus-megatron-lm-run-training-v259:
|
||||
.. _amd-primus-megatron-lm-run-training-v25.11:
|
||||
|
||||
Run training
|
||||
============
|
||||
@@ -269,7 +252,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.3 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 3.3 70B BF16, run:
|
||||
|
||||
@@ -280,28 +263,27 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 6 \
|
||||
--global_batch_size 48 \
|
||||
EXP=examples/megatron/configs/MI355X/llama3.3_70B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 2 \
|
||||
--global_batch_size 16
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.3_70B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.1 8B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 3.1 8B FP8, run:
|
||||
|
||||
@@ -312,22 +294,21 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid \
|
||||
--micro_batch_size 4 \
|
||||
--global_batch_size 512 \
|
||||
EXP=examples/megatron/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
For Llama 3.1 8B BF16, use the following command:
|
||||
|
||||
@@ -338,26 +319,27 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 4 \
|
||||
--global_batch_size 512 \
|
||||
EXP=examples/megatron/configs/MI355X/llama3.1_BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.1 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 3.1 70B BF16, run:
|
||||
|
||||
@@ -368,20 +350,21 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 4 \
|
||||
--global_batch_size 32
|
||||
EXP=examples/megatron/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
To run the training on a single node for Llama 3.1 70B FP8, use the following command.
|
||||
|
||||
@@ -398,20 +381,20 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid \
|
||||
--no_fp8_weight_transpose_cache true \
|
||||
--micro_batch_size 3 \
|
||||
--global_batch_size 24
|
||||
EXP=examples/megatron/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--num_layers 40 \
|
||||
@@ -422,7 +405,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 2 7B FP8, run:
|
||||
|
||||
@@ -433,22 +416,21 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid \
|
||||
--micro_batch_size 13 \
|
||||
--global_batch_size 416
|
||||
EXP=examples/megatron/configs/MI355X/llama2_7B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama2_7B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
To run pre-training for Llama 2 7B BF16, run:
|
||||
|
||||
@@ -459,26 +441,27 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 10 \
|
||||
--global_batch_size 640
|
||||
EXP=examples/megatron/configs/MI355X/llama2_7B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama2_7B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run pre-training for Llama 2 70B BF16, run:
|
||||
|
||||
@@ -489,26 +472,27 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 17 \
|
||||
--global_batch_size 272
|
||||
EXP=examples/megatron/configs/MI355X/llama2_70B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/llama2_70B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to DeepSeek-V3.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run training on a single node for DeepSeek-V3 (MoE with expert parallel) BF16 with 3-layer proxy,
|
||||
use the following command:
|
||||
@@ -520,7 +504,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI355X/deepseek_v3-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--num_layers 3 \
|
||||
--moe_layer_freq 1 \
|
||||
@@ -533,7 +517,12 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/deepseek_v3-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--num_layers 3 \
|
||||
--moe_layer_freq 1 \
|
||||
@@ -543,7 +532,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to DeepSeek-V2-Lite.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel) BF16,
|
||||
use the following command:
|
||||
@@ -555,27 +544,27 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 12 \
|
||||
--global_batch_size 768
|
||||
EXP=examples/megatron/configs/MI355X/deepseek_v2_lite-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--global_batch_size 256
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/deepseek_v2_lite-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Mixtral 8x7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
|
||||
use the following command:
|
||||
@@ -587,18 +576,20 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 4 \
|
||||
--global_batch_size 256
|
||||
EXP=examples/megatron/configs/MI355X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
|
||||
@@ -606,7 +597,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Mixtral 8x22B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run training on a single node for Mixtral 8x22B BF16 (MoE with expert parallel) 4-layer proxy,
|
||||
use the following command:
|
||||
@@ -618,20 +609,20 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--num_layers 4 \
|
||||
--pipeline_model_parallel_size 1 \
|
||||
--micro_batch_size 2 \
|
||||
--global_batch_size 16
|
||||
EXP=examples/megatron/configs/MI355X/mixtral_8x22B_v0.1-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/mixtral_8x22B_v0.1-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--num_layers 4 \
|
||||
@@ -643,7 +634,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Qwen 2.5 7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run training on a single node for Qwen 2.5 7B BF16, use the following
|
||||
command:
|
||||
@@ -655,20 +646,21 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 16 \
|
||||
--global_batch_size 768
|
||||
EXP=examples/megatron/configs/MI355X/qwen2.5_7B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/qwen2.5_7B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
For FP8, use the following command.
|
||||
|
||||
@@ -679,28 +671,27 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
--micro_batch_size 20 \
|
||||
--global_batch_size 800
|
||||
EXP=examples/megatron/configs/MI355X/qwen2.5_7B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--fp8 hybrid
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
EXP=examples/megatron/configs/MI300X/qwen2.5_7B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Qwen 2.5 72B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
|
||||
|
||||
@@ -711,7 +702,7 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI355X/qwen2.5_72B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50 \
|
||||
--micro_batch_size 16 \
|
||||
@@ -722,11 +713,15 @@ To run training on a single node, navigate to ``/workspace/Primus`` and use the
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--train_iters 50
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
.. _amd-primus-megatron-multi-node-examples-v259:
|
||||
EXP=examples/megatron/configs/MI300X/qwen2.5_72B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. _amd-primus-megatron-multi-node-examples-v25.11:
|
||||
|
||||
Multi-node training examples
|
||||
----------------------------
|
||||
@@ -740,28 +735,27 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
{% set docker = data.docker %}
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git
|
||||
cd Primus
|
||||
git checkout c4c083de64ba3e8f19ccc9629411267108931f9e
|
||||
git submodule update --init --recursive
|
||||
|
||||
.. code-block:: shell
|
||||
export DOCKER_IMAGE={{ docker.pull_tag }}
|
||||
export HF_TOKEN=<your_HF_token>
|
||||
export HSA_NO_SCRATCH_RECLAIM=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
|
||||
export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
|
||||
export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
|
||||
export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for RoCE
|
||||
|
||||
git clone --recurse-submodules https://github.com/AMD-AGI/Primus.git
|
||||
cd Primus
|
||||
git checkout e16b27b
|
||||
|
||||
export DOCKER_IMAGE={{ docker.pull_tag }}
|
||||
export HF_TOKEN=<your_HF_token>
|
||||
export HSA_NO_SCRATCH_RECLAIM=1
|
||||
export NVTE_CK_USES_BWD_V3=1
|
||||
export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
|
||||
export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
|
||||
export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
|
||||
export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for RoCE
|
||||
{% endfor %}
|
||||
# Set the variables for better performance
|
||||
# only on MI325X and MI300X
|
||||
export PRIMUS_TURBO_ATTN_V3_ATOMIC_FP32=1
|
||||
export NVTE_CK_IS_V3_ATOMIC_FP32=1
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -769,13 +763,13 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
* If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect them. However, since NICs can vary across clusters, it is recommended to explicitly export the NCCL parameters for your cluster (see the example after this list).
|
||||
* To find your network interface, you can use ``ip a``.
|
||||
* To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices.
|
||||
* Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v259`) as appropriate.
|
||||
* Remember to set ``DOCKER_IMAGE`` and ``HF_TOKEN`` (see :ref:`amd-primus-megatron-lm-tokenizer-v25.11`) as appropriate.
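For example, assuming the cluster exposes ``mlx5_*`` RDMA devices and a ``bond0`` network interface (both names are illustrative and will differ on your cluster), the exports might look like:

.. code-block:: shell

ibv_devices        # list RDMA/IB devices, for example mlx5_0 ... mlx5_7
ip a               # list network interfaces, for example bond0
export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7
export NCCL_SOCKET_IFNAME=bond0
export GLOO_SOCKET_IFNAME=bond0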
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.1 8B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To train Llama 3.1 8B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -784,16 +778,15 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
# Adjust the training parameters.
|
||||
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_slurm_pretrain.sh \
|
||||
--global_batch_size 1024 \
|
||||
--fp8 hybrid
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To train Llama 2 7B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -802,16 +795,15 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
# Adjust the training parameters.
|
||||
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama2_7B-FP8-pretrain.yaml \
|
||||
bash ./examples/run_slurm_pretrain.sh \
|
||||
--global_batch_size 2048 \
|
||||
--fp8 hybrid
|
||||
|
||||
.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.1 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To train Llama 3.1 70B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -820,20 +812,18 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
# Adjust the training parameters.
|
||||
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 4 \
|
||||
--global_batch_size 256 \
|
||||
--recompute_num_layers 80 \
|
||||
--no_fp8_weight_transpose_cache true \
|
||||
--fp8 hybrid
|
||||
|
||||
To train Llama 3.1 70B BF16 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 1 \
|
||||
--global_batch_size 256 \
|
||||
@@ -843,7 +833,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 2 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To train Llama 2 70B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -852,20 +842,18 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
# Adjust the training parameters.
|
||||
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case.
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama2_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 10 \
|
||||
--global_batch_size 640 \
|
||||
--recompute_num_layers 80 \
|
||||
--no_fp8_weight_transpose_cache true \
|
||||
--fp8 hybrid
|
||||
|
||||
To train Llama 2 70B BF16 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama2_70B-BF16-pretrain.yaml \
|
||||
bash ./examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 2 \
|
||||
--global_batch_size 1536 \
|
||||
@@ -875,7 +863,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Llama 3.3 70B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To train Llama 3.3 70B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -884,20 +872,18 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
# Adjust the training parameters.
|
||||
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama3.3_70B-FP8-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 4 \
|
||||
--global_batch_size 256 \
|
||||
--recompute_num_layers 80 \
|
||||
--no_fp8_weight_transpose_cache true \
|
||||
--fp8 hybrid
|
||||
|
||||
To train Llama 3.3 70B BF16 on 8 nodes, run:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/llama3.3_70B-BF16-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 1 \
|
||||
--global_batch_size 256 \
|
||||
@@ -907,7 +893,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Mixtral 8x7B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To train Mixtral 8x7B BF16 on 8 nodes, run:
|
||||
|
||||
@@ -916,7 +902,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
# Adjust the training parameters.
|
||||
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/MI300X/mixtral_8x7B_v0.1-BF16-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 2 \
|
||||
--global_batch_size 256
|
||||
@@ -925,7 +911,7 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
|
||||
Once setup is complete, run the appropriate training command.
|
||||
The following run commands are tailored to Qwen 2.5 72B.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-megatron-lm-model-support-v25.11` to switch to another available model.
|
||||
|
||||
To train Qwen2.5 72B FP8 on 8 nodes, run:
|
||||
|
||||
@@ -934,15 +920,13 @@ to launch the multi-node workload. Use the following steps to setup your environ
|
||||
# Adjust the training parameters.
|
||||
# For example, `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case
|
||||
NNODES=8 \
|
||||
EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
|
||||
EXP=examples/megatron/configs/qwen2.5_72B-FP8-pretrain.yaml \
|
||||
bash examples/run_slurm_pretrain.sh \
|
||||
--micro_batch_size 8 \
|
||||
--global_batch_size 512 \
|
||||
--recompute_num_layers 80 \
|
||||
--no_fp8_weight_transpose_cache true \
|
||||
--fp8 hybrid
|
||||
|
||||
.. _amd-primus-megatron-lm-benchmark-test-vars-v259:
|
||||
.. _amd-primus-megatron-lm-benchmark-test-vars-v25.11:
|
||||
|
||||
Key options
|
||||
-----------
|
||||
@@ -987,7 +971,10 @@ num_layers
|
||||
Known issues
|
||||
============
|
||||
|
||||
PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.
|
||||
The DeepSeek-V3 proxy model and the Mixtral 8x22B proxy model may exit with an error
|
||||
due to an issue when freeing memory. However, this does not impact training runs. All
|
||||
iterations (50 in this case) complete before the exit, and
|
||||
the results remain available at the end.
|
||||
|
||||
Further reading
|
||||
===============
|
||||
|
||||
@@ -17,7 +17,7 @@ Primus now supports the PyTorch torchtitan backend.
|
||||
<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
|
||||
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
|
||||
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
|
||||
including `Megatron-LM <primus-megatron>`__, torchtitan, and torchtune.
|
||||
including torchtitan and :doc:`Megatron-LM <primus-megatron>`.
|
||||
|
||||
Primus with the PyTorch torchtitan backend is designed to replace the
|
||||
:doc:`ROCm PyTorch training <pytorch-training>` workflow. See
|
||||
@@ -29,12 +29,10 @@ with Primus Turbo optimizations.
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
.. tab-set::
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
.. tab-item:: {{ data.docker.pull_tag }}
|
||||
:sync: {{ data.docker.pull_tag }}
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -42,13 +40,12 @@ with Primus Turbo optimizations.
|
||||
* - Software component
|
||||
- Version
|
||||
|
||||
{% for component_name, component_version in docker.components.items() %}
|
||||
{% for component_name, component_version in data.docker.components.items() %}
|
||||
* - {{ component_name }}
|
||||
- {{ component_version }}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
.. _amd-primus-pytorch-model-support-v259:
|
||||
.. _amd-primus-pytorch-model-support-v25.11:
|
||||
|
||||
Supported models
|
||||
================
|
||||
@@ -67,7 +64,7 @@ vary by model -- select one to get started.
|
||||
<div class="col-2 me-1 px-2 model-param-head">Model</div>
|
||||
<div class="row col-10 pe-0">
|
||||
{% for model_group in model_groups %}
|
||||
<div class="col-12 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
<div class="col-6 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
@@ -94,7 +91,7 @@ vary by model -- select one to get started.
|
||||
For additional workloads, including Llama 3.3, Llama 3.2, Llama 2, GPT OSS, Qwen, and Flux models,
|
||||
see the :doc:`pytorch-training` documentation (without Primus).
|
||||
|
||||
.. _amd-primus-pytorch-performance-measurements-v259:
|
||||
.. _amd-primus-pytorch-performance-measurements-v25.11:
|
||||
|
||||
System validation
|
||||
=================
|
||||
@@ -120,20 +117,11 @@ Pull the Docker image
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
|
||||
Use the following command to pull the Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker pull {{ data.docker.pull_tag }}
|
||||
|
||||
Run training
|
||||
============
|
||||
@@ -145,7 +133,7 @@ tweak some configurations (such as batch sizes).
|
||||
|
||||
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
|
||||
|
||||
{% set dockers = data.dockers %}
|
||||
{% set docker = data.docker %}
|
||||
{% set model_groups = data.model_groups %}
|
||||
|
||||
.. tab-set::
|
||||
@@ -158,7 +146,7 @@ tweak some configurations (such as batch sizes).
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run command is tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model.
|
||||
|
||||
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
|
||||
directory and install the required packages on the host machine.
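A minimal sketch of this step, assuming the repository's ``requirements.txt`` lists the required packages, is:

.. code-block:: shell

git clone https://github.com/ROCm/MAD.git
cd MAD
pip install -r requirements.txt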
|
||||
@@ -185,13 +173,6 @@ tweak some configurations (such as batch sizes).
|
||||
``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
|
||||
model are collected in ``~/MAD/perf.csv``.
|
||||
|
||||
.. note::
|
||||
|
||||
Currently, Primus torchtitan models are run with Primus Turbo
|
||||
enabled for enhanced performance. To disable Primus Turbo,
|
||||
modify respective configuration file
|
||||
``scripts/primus/pytorch_train/primus_torchtitan_scripts/llama3_[8B|70B]-[BF16|FP8].yaml``.
|
||||
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
@@ -203,48 +184,34 @@ tweak some configurations (such as batch sizes).
|
||||
.. container:: model-doc {{ model.mad_tag }}
|
||||
|
||||
The following run commands are tailored to {{ model.model }}.
|
||||
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.
|
||||
See :ref:`amd-primus-pytorch-model-support-v25.11` to switch to another available model.
|
||||
|
||||
.. rubric:: Download the Docker image and required packages
|
||||
|
||||
1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.
|
||||
1. Pull the ``{{ docker.pull_tag }}`` Docker image from Docker Hub.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker pull {{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker pull {{ docker.pull_tag }}
|
||||
|
||||
2. Run the Docker container.
|
||||
|
||||
.. tab-set::
|
||||
.. code-block:: shell
|
||||
|
||||
{% for supported_gpus, docker in dockers.items() %}
|
||||
.. tab-item:: {{ supported_gpus }}
|
||||
:sync: {{ supported_gpus }}
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
{% endfor %}
|
||||
docker run -it \
|
||||
--device /dev/dri \
|
||||
--device /dev/kfd \
|
||||
--network host \
|
||||
--ipc host \
|
||||
--group-add video \
|
||||
--cap-add SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--privileged \
|
||||
-v $HOME:$HOME \
|
||||
-v $HOME/.ssh:/root/.ssh \
|
||||
--shm-size 64G \
|
||||
--name training_env \
|
||||
{{ docker.pull_tag }}
|
||||
|
||||
Use these commands if you exit the ``training_env`` container and need to return to it.
|
||||
|
||||
@@ -253,6 +220,9 @@ tweak some configurations (such as batch sizes).
|
||||
docker start training_env
|
||||
docker exec -it training_env bash
|
||||
|
||||
The Docker container hosts verified commit ``c4c083de`` of the `Primus
|
||||
<https://github.com/AMD-AGI/Primus/tree/c4c083de64ba3e8f19ccc9629411267108931f9e/>`__ repository.
|
||||
|
||||
.. rubric:: Prepare training datasets and dependencies
|
||||
|
||||
The following benchmarking examples require downloading models and datasets
|
||||
@@ -283,75 +253,56 @@ tweak some configurations (such as batch sizes).
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 6
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 6
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 4
|
||||
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
To train Llama 3.1 8B with FP8 precision, use the following command.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: MI355X and MI350X
|
||||
:sync: MI355X and MI300X
|
||||
:sync: MI355X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-BF16-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 8
|
||||
EXP=examples/torchtitan/configs/MI355X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. tab-item:: MI325X
|
||||
:sync: MI325X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 7
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh --training.local_batch_size 7
|
||||
|
||||
.. tab-item:: MI300X
|
||||
:sync: MI325X and MI300X
|
||||
:sync: MI300X
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
EXP=examples/torchtitan/configs/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh \
|
||||
--metrics.enable_tensorboard false \
|
||||
--profiling.enable_profiling false \
|
||||
--training.batch_size 5
|
||||
EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml \
|
||||
bash examples/run_pretrain.sh
|
||||
|
||||
.. container:: model-doc primus_pyt_train_llama-3.1-70b
|
||||
|
||||
@@ -364,36 +315,57 @@ tweak some configurations (such as batch sizes).

.. code-block:: shell

EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 8
EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh

.. tab-item:: MI325X
:sync: MI325X

.. code-block:: shell

EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 6
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 6

.. tab-item:: MI300X
:sync: MI325X and MI300X
:sync: MI300X

.. code-block:: shell

EXP=examples/torchtitan/configs/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 4
EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml \
bash examples/run_pretrain.sh

To train Llama 3.1 70B with FP8 precision, use the following command.

.. tab-set::

.. tab-item:: MI355X and MI350X
:sync: MI355X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI355X/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh

.. tab-item:: MI325X
:sync: MI325X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 5

.. tab-item:: MI300X
:sync: MI300X

.. code-block:: shell

EXP=examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh

.. container:: model-doc primus_pyt_train_deepseek-v3-16b

Use the following command to train DeepSeek V3 16B with BF16 precision using Primus torchtitan.

.. tab-set::

.. tab-item:: MI355X and MI350X
@@ -401,151 +373,27 @@ tweak some configurations (such as batch sizes).

.. code-block:: shell

EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 6
EXP=examples/torchtitan/configs/MI355X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh

.. tab-item:: MI325X
:sync: MI325X

.. code-block:: shell

EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 5
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh --training.local_batch_size 10

.. tab-item:: MI300X
:sync: MI325X and MI300X
:sync: MI300X

.. code-block:: shell

EXP=examples/torchtitan/configs/llama3.1_70B-FP8-pretrain.yaml \
bash examples/run_pretrain.sh \
--metrics.enable_tensorboard false \
--profiling.enable_profiling false \
--training.batch_size 3
EXP=examples/torchtitan/configs/MI300X/deepseek_v3_16b-pretrain.yaml \
bash examples/run_pretrain.sh
{% endfor %}
{% endfor %}

.. tab-item:: Standalone torchtitan benchmarking

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{ model.mad_tag }}

The following run commands are tailored to {{ model.model }}.
See :ref:`amd-primus-pytorch-model-support-v259` to switch to another available model.

.. rubric:: Download the Docker image and required packages

1. Pull the appropriate Docker image for your AMD GPU architecture from Docker Hub.

.. tab-set::

{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}

.. code-block:: shell

docker pull {{ docker.pull_tag }}
{% endfor %}

2. Run the Docker container.

.. tab-set::

{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}

.. code-block:: shell

docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}
{% endfor %}

Use these commands if you exit the ``training_env`` container and need to return to it.

.. code-block:: shell

docker start training_env
docker exec -it training_env bash

3. Navigate to the ``torchtitan`` workspace directory.

.. code-block:: shell

cd /workspace/torchtitan

.. rubric:: Download the tokenizer

1. The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.

.. code-block:: shell

export HF_TOKEN=$your_personal_hugging_face_access_token

2. Download the tokenizer for your model.

.. container:: model-doc {{ model.mad_tag }}

.. code-block:: shell

python3 scripts/download_tokenizer.py \
--repo_id {{ model.model_repo }} \
--tokenizer_path "original" \
--hf_token=${HF_TOKEN}

.. rubric:: Pretraining examples

Run the training script with the appropriate configuration file.

To train with BF16 precision, use the following command:

.. container:: model-doc {{ model.mad_tag }}

.. code-block:: shell

CONFIG_FILE={{ model.config_file.bf16 }} \
./run_train.sh

To train with FP8 precision, use the following command:

.. container:: model-doc {{ model.mad_tag }}

.. code-block:: shell

CONFIG_FILE={{ model.config_file.fp8 }} \
./run_train.sh
{% endfor %}
{% endfor %}

Known issues
============

PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.

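If accurate traces are not required for a particular run, a practical mitigation is to leave profiling disabled. The sketch below simply reuses the ``EXP``/``run_pretrain.sh`` pattern from the benchmarking commands above; the config path shown is only an example.

.. code-block:: shell

   # Example only: any of the EXP configs shown above can be used here.
   EXP=examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml \
   bash examples/run_pretrain.sh --profiling.enable_profiling false
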
Further reading
===============

@@ -14,7 +14,7 @@ Training a model with PyTorch on ROCm

<https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
including `Megatron-LM <primus-megatron>`__, torchtitan, and torchtune.
including torchtitan and :doc:`Megatron-LM <primus-megatron>`.

See :doc:`primus-pytorch` for details.

@@ -27,12 +27,10 @@ training workloads:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

{% set dockers = data.dockers %}
.. tab-set::

{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}
.. tab-item:: {{ data.docker.pull_tag }}
:sync: {{ data.docker.pull_tag }}

.. list-table::
:header-rows: 1
@@ -40,13 +38,12 @@ training workloads:
* - Software component
- Version

{% for component_name, component_version in docker.components.items() %}
{% for component_name, component_version in data.docker.components.items() %}
* - {{ component_name }}
- {{ component_version }}
{% endfor %}
{% endfor %}

.. _amd-pytorch-training-model-support-v259:
.. _amd-pytorch-training-model-support-v25.11:

Supported models
================
@@ -88,7 +85,7 @@ one to get started.

</div>
</div>

.. _amd-pytorch-training-supported-training-modes-v259:
.. _amd-pytorch-training-supported-training-modes-v25.11:

The following table lists supported training modes per model.

@@ -123,7 +120,7 @@ The following table lists supported training modes per model.

unlisted fine-tuning methods by using an existing file in the
``/workspace/torchtune/recipes/configs`` directory as a template.

.. _amd-pytorch-training-performance-measurements-v259:
.. _amd-pytorch-training-performance-measurements-v25.11:

Performance measurements
========================
@@ -164,7 +161,7 @@ Run training

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

{% set dockers = data.dockers %}
{% set docker = data.docker %}
{% set model_groups = data.model_groups %}

Once the setup is complete, choose between two options to start benchmarking training:
@@ -179,7 +176,7 @@ Run training

.. container:: model-doc {{ model.mad_tag }}

The following run command is tailored to {{ model.model }}.
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model.

1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
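A minimal sketch of this step follows; the requirements file path is an assumption, so check the repository README for the authoritative install instructions.

.. code-block:: shell

   # Clone MAD and install its host-side Python dependencies.
   git clone https://github.com/ROCm/MAD.git
   cd MAD
   pip install -r requirements.txt  # assumed file name; see the repo README
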
@@ -217,7 +214,7 @@ Run training

.. container:: model-doc {{ model.mad_tag }}

The following commands are tailored to {{ model.model }}.
See :ref:`amd-pytorch-training-model-support-v259` to switch to another available model.
See :ref:`amd-pytorch-training-model-support-v25.11` to switch to another available model.

{% endfor %}
{% endfor %}
@@ -226,42 +223,28 @@ Run training

1. Use the following command to pull the Docker image from Docker Hub.

.. tab-set::
.. code-block:: shell

{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}

.. code-block:: shell

docker pull {{ docker.pull_tag }}
{% endfor %}
docker pull {{ docker.pull_tag }}

2. Launch the Docker container.

.. tab-set::
.. code-block:: shell

{% for supported_gpus, docker in dockers.items() %}
.. tab-item:: {{ supported_gpus }}
:sync: {{ supported_gpus }}

.. code-block:: shell

docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}
{% endfor %}
docker run -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME:$HOME \
-v $HOME/.ssh:/root/.ssh \
--shm-size 64G \
--name training_env \
{{ docker.pull_tag }}

Use these commands if you exit the ``training_env`` container and need to return to it.

@@ -419,11 +402,34 @@ Run training

.. container:: model-doc {{ model.mad_tag }}

.. rubric:: Pre-training
.. rubric:: Pretraining

To start the pre-training benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.

{% if model.mad_tag == "pyt_train_dlrm" %}

1. Go to the DLRM directory.

.. code-block:: shell

cd /workspace/DLRMBenchmark

2. To run the single node training benchmark for DLRM-v2 with TF32 precision,
run the following script.

.. code-block:: shell

./launch_training_single_node.sh

To run with MAD within the Docker container, use the following command.

.. code-block:: shell

./pytorch_benchmark_report.sh -t pretrain -m DLRM

{% else %}

.. code-block:: shell

./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
@@ -466,6 +472,7 @@ Run training

* - ``$sequence_length``
- Sequence length for the language model.
- Between 2048 and 8192. 8192 by default.
{% endif %}
{% endif %}

{% set training_modes = model.training_modes %}
@@ -525,7 +532,7 @@ Run training

To start the fine-tuning benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v259>`.
See :ref:`supported training modes <amd-pytorch-training-supported-training-modes-v25.11>`.

.. code-block:: shell

@@ -590,7 +597,7 @@ Run training

For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.

.. _amd-pytorch-training-multinode-examples-v259:
.. _amd-pytorch-training-multinode-examples-v25.11:

Multi-node training
-------------------
@@ -639,11 +646,6 @@ To launch the training job on a SLURM cluster for Llama 3.3 70B, run the followi

Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.

Known issues
============

PyTorch Profiler may produce inaccurate traces when CPU activity profiling is enabled.

Further reading
===============

@@ -46,7 +46,7 @@ In DDP training, each process or worker owns a replica of the model and processe

See the following developer blogs for more in-depth explanations and examples.

* `Multi GPU training with DDP — PyTorch Tutorials <https://pytorch.org/tutorials/beginner/ddp_Series_multigpu.html>`_
* `Multi GPU training with DDP — PyTorch Tutorials <https://docs.pytorch.org/tutorials/beginner/ddp_series_multigpu.html>`__

* `Building a decoder transformer model on AMD GPUs — ROCm Blogs
<https://rocm.blogs.amd.com/artificial-intelligence/decoder-transformer/README.html#distributed-training-on-multiple-gpus>`_

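For readers connecting these links to practice, the following is a minimal, illustrative launch sketch. It assumes a user-provided ``train.py`` that wraps its model in ``torch.nn.parallel.DistributedDataParallel`` and reads the rank set by the launcher; it is not part of the tutorials referenced above.

.. code-block:: shell

   # Launch one DDP worker per GPU on a single node (8 GPUs assumed).
   # train.py is a placeholder for your own DDP-enabled training script.
   torchrun --standalone --nproc_per_node=8 train.py
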
@@ -65,6 +65,8 @@ ROCm documentation is organized into the following categories:
* [ROCm libraries](./reference/api-libraries.md)
* [ROCm tools, compilers, and runtimes](./reference/rocm-tools.md)
* [GPU hardware specifications](./reference/gpu-arch-specs.rst)
* [Hardware atomics operation support](./reference/gpu-atomics-operation.rst)
* [Environment variables](./reference/env-variables.rst)
* [Data types and precision support](./reference/precision-support.rst)
* [Graph safe support](./reference/graph-safe-support.rst)
<!-- markdownlint-enable MD051 -->

173 docs/reference/env-variables.rst Normal file
@@ -0,0 +1,173 @@

.. meta::
:description: Environment variables reference
:keywords: AMD, ROCm, environment variables, environment, reference, settings

.. role:: cpp(code)
:language: cpp

.. _env-variables-reference:

*************************************************************
ROCm environment variables
*************************************************************

ROCm provides a set of environment variables that allow users to configure and optimize their development
and runtime experience. These variables define key settings such as installation paths, platform selection,
and runtime behavior for applications running on AMD accelerators and GPUs.

This page outlines commonly used environment variables across different components of the ROCm software stack,
including HIP and ROCR-Runtime. Understanding these variables can help streamline software development and
execution in ROCm-based environments.

HIP environment variables
=========================

The following tables list the HIP environment variables.

GPU isolation variables
--------------------------------------------------------------------------------

.. remote-content::
:repo: ROCm/rocm-systems
:path: /projects/hip/docs/reference/env_variables/gpu_isolation_hip_env.rst
:default_branch: develop
:tag_prefix: docs/

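As a quick illustration (the authoritative variable list is pulled in above), GPU isolation variables restrict which devices a process can enumerate. The application name below is a placeholder.

.. code-block:: shell

   # Expose only the first two GPUs to the application.
   export HIP_VISIBLE_DEVICES=0,1
   ./my_hip_app  # placeholder application
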
Profiling variables
--------------------------------------------------------------------------------

.. remote-content::
:repo: ROCm/rocm-systems
:path: /projects/hip/docs/reference/env_variables/profiling_hip_env.rst
:default_branch: develop
:tag_prefix: docs/

Debug variables
--------------------------------------------------------------------------------

.. remote-content::
:repo: ROCm/rocm-systems
:path: /projects/hip/docs/reference/env_variables/debug_hip_env.rst
:default_branch: develop
:tag_prefix: docs/

Memory management related variables
--------------------------------------------------------------------------------

.. remote-content::
:repo: ROCm/rocm-systems
:path: /projects/hip/docs/reference/env_variables/memory_management_hip_env.rst
:default_branch: develop
:tag_prefix: docs/

Other useful variables
--------------------------------------------------------------------------------

.. remote-content::
:repo: ROCm/rocm-systems
:path: /projects/hip/docs/reference/env_variables/miscellaneous_hip_env.rst
:default_branch: develop
:tag_prefix: docs/

ROCR-Runtime environment variables
==================================

The following table lists the ROCR-Runtime environment variables:

.. remote-content::
:repo: ROCm/rocm-systems
:path: /projects/rocr-runtime/runtime/docs/data/env_variables.rst
:default_branch: develop
:tag_prefix: docs/

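As an illustration (see the content above for the authoritative list), ROCR-Runtime variables are typically exported in the shell before launching an application. The variable shown and the application name below are chosen for demonstration only.

.. code-block:: shell

   # Illustrative: route copies through compute kernels instead of SDMA engines,
   # which can be useful when isolating transfer-related issues.
   export HSA_ENABLE_SDMA=0
   ./my_rocm_app  # placeholder application
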
HIPCC environment variables
===========================

This topic provides descriptions of the HIPCC environment variables.

.. remote-content::
:repo: ROCm/llvm-project
:path: amd/hipcc/docs/env.rst
:default_branch: amd-staging
:start_line: 14
:tag_prefix: docs/

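As a brief, illustrative usage sketch (the full list is included above), ``HIPCC_VERBOSE`` controls how much of the underlying compiler invocation ``hipcc`` prints.

.. code-block:: shell

   # Print the commands hipcc issues to the underlying compiler and linker.
   export HIPCC_VERBOSE=7
   hipcc square.cpp -o square  # square.cpp is a placeholder source file
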
Environment variables in ROCm libraries
=======================================

Many ROCm libraries define environment variables for specific tuning, debugging,
or behavioral control. The table below provides an overview and links to further
documentation.

.. list-table::
:header-rows: 1
:widths: 30, 70

* - Library
- Purpose of Environment Variables

* - :doc:`hipBLASLt <hipblaslt:reference/env-variables>`
- Manage logging, debugging, offline tuning, and stream-K configuration
for hipBLASLt.

* - :doc:`hipSPARSELt <hipsparselt:reference/env-variables>`
- Control logging, debugging, and performance monitoring of hipSPARSELt.

* - :doc:`rocBLAS <rocblas:reference/env-variables>`
- Performance tuning, kernel selection, logging, and debugging for BLAS
operations.

* - :doc:`rocSOLVER <rocsolver:reference/env_variables>`
- Control logging of rocSOLVER.

* - :doc:`rocSPARSE <rocsparse:reference/env_variables>`
- Control logging of rocSPARSE.

* - :doc:`MIGraphX <amdmigraphx:reference/MIGraphX-dev-env-vars>`
- Control debugging, testing, and model performance tuning options for
MIGraphX.

* - :doc:`MIOpen <miopen:reference/env_variables>`
- Control MIOpen logging and debugging, find mode and algorithm behavior,
and more.

* - :doc:`MIVisionX <mivisionx:reference/MIVisionX-env-variables>`
- Control core OpenVX, GPU/device and debugging/profiling, stitching and
chroma key configurations, file I/O operations, model deployment, and
neural network parameters of MIVisionX.

* - :doc:`RCCL <rccl:api-reference/env-variables>`
- Control the logging, debugging, and tuning behavior of RCCL.

* - :doc:`RPP <rpp:reference/rpp-env-variables>`
- Logging, debugging, compiler and assembly management, and cache control in RPP.

* - `Tensile <https://rocm.docs.amd.com/projects/Tensile/en/latest/src/reference/environment-variables.html>`_
- Enable testing, debugging, and experimental features for Tensile clients and applications.

Key single-variable details
===========================

This section provides detailed descriptions, in the standard format, for ROCm
libraries that feature a single key environment variable (or a very small set)
documented directly on this page for convenience.

.. _rocalution-vars-detail:

rocALUTION
----------

.. list-table::
:header-rows: 1
:widths: 70,30

* - Environment variable
- Value

* - | ``ROCALUTION_LAYER``
| If set to ``1``, enable file logging. Logs each rocALUTION function call including object constructor/destructor, address of the object, memory allocation, data transfers, all function calls for matrices, vectors, solvers, and preconditioners. The log file is placed in the working directory.
- | ``1`` (Enable trace file logging)
| Default: Not set.

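For example, to capture a rocALUTION trace log for a single run (the application name is a placeholder):

.. code-block:: shell

   # Enables rocALUTION trace file logging for this run only;
   # the log file is written to the current working directory.
   ROCALUTION_LAYER=1 ./my_rocalution_app
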
@@ -93,7 +93,7 @@ The following table shows whether a ROCm library is graph-safe.

- ⚠️ (experimental)
*
- `rocThrust <https://github.com/ROCm/rocThrust>`_
- ❌ (see :doc:`details <rocthrust:hipgraph-support>`)
- ❌
*
- `rocWMMA <https://github.com/ROCm/rocWMMA>`_
- ❌

@@ -10,6 +10,8 @@

| Version | Release date |
| ------- | ------------ |
| [7.1.1](https://rocm.docs.amd.com/en/docs-7.1.1/) | November 26, 2025 |
| [7.1.0](https://rocm.docs.amd.com/en/docs-7.1.0/) | October 30, 2025 |
| [7.0.2](https://rocm.docs.amd.com/en/docs-7.0.2/) | October 10, 2025 |
| [7.0.1](https://rocm.docs.amd.com/en/docs-7.0.1/) | September 17, 2025 |
| [7.0.0](https://rocm.docs.amd.com/en/docs-7.0.0/) | September 16, 2025 |

Some files were not shown because too many files have changed in this diff.