Update .wordlist.txt

fix spelling
Update RELEASE.md
2026-01-10 23:28:03 -05:00 · 2025-09-16 12:57:00 -07:00 · 2025-09-16 12:52:42 -07:00 · 2025-09-16 12:51:10 -07:00 · 2025-09-16 13:33:07 -04:00 · 2025-09-16 13:04:24 -04:00
163 changed files with 4388 additions and 10291 deletions
--- a/.azuredevops/components/HIPIFY.yml
+++ b/.azuredevops/components/HIPIFY.yml
@@ -79,7 +79,7 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - task: Bash@3
      displayName: Add lit to PATH
      inputs:
--- a/.azuredevops/components/MIOpen.yml
+++ b/.azuredevops/components/MIOpen.yml
@@ -131,7 +131,7 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -212,7 +212,7 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -37,7 +37,6 @@ parameters:
    - libdrm-dev
    - libelf-dev
    - libnuma-dev
-    - libsimde-dev
    - ninja-build
    - pkg-config
 - name: rocmDependencies
--- a/.azuredevops/components/aqlprofile.yml
+++ b/.azuredevops/components/aqlprofile.yml
@@ -1,174 +0,0 @@
-parameters:
- name: componentName
-  type: string
-  default: aqlprofile
- name: checkoutRepo
-  type: string
-  default: 'self'
- name: checkoutRef
-  type: string
-  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
-# set to true if doing full build of ROCm stack
-# and dependencies are pulled from same pipeline
- name: aggregatePipeline
-  type: boolean
-  default: false
- name: aptPackages
-  type: object
-  default:
-    - cmake
-    - git
-    - ninja-build
-    - python3-pip
- name: rocmDependencies
-  type: object
-  default:
-    - clr
-    - llvm-project
-    - ROCR-Runtime
- name: rocmTestDependencies
-  type: object
-  default:
-    - clr
-    - llvm-project
-    - ROCR-Runtime
-    - rocprofiler-register
-
- name: jobMatrix
-  type: object
-  default:
-    buildJobs:
-      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-    testJobs:
-      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-
-jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.os }}
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ variables.MEDIUM_BUILD_POOL }}
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
-      parameters:
-        dependencyList:
-          - gtest
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependencies }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        os: ${{ job.os }}
-        consolidateBuildAndInstall: true
-        extraBuildFlags: >-
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-          -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/aqlprofile/cmake_modules
-          -DAQLPROFILE_BUILD_TESTS=ON
-          -DGPU_TARGETS=${{ job.target }}
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
-      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-      parameters:
-        componentName: ${{ parameters.componentName }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    - ${{ if eq(job.os, 'ubuntu2204') }}:
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          gpuTarget: ${{ job.target }}
-
- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - checkout: none
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          packageManager: ${{ job.packageManager }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          preTargetFilter: ${{ parameters.componentName }}
-          gpuTarget: ${{ job.target }}
-          os: ${{ job.os }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmTestDependencies }}
-          gpuTarget: ${{ job.target }}
-          os: ${{ job.os }}
-          ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          testDir: $(Agent.BuildDirectory)/rocm/share/hsa-amd-aqlprofile/
-          testExecutable: ./run_tests.sh
-          testParameters: ''
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          environment: test
-          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/hipBLASLt.yml
+++ b/.azuredevops/components/hipBLASLt.yml
@@ -77,7 +77,6 @@ parameters:
    - clr
    - hipBLAS-common
    - llvm-project
-    - rocm-cmake
    - rocminfo
    - rocm_smi_lib
    - rocprofiler-register
@@ -145,7 +144,7 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
--- a/.azuredevops/components/hipSPARSELt.yml
+++ b/.azuredevops/components/hipSPARSELt.yml
@@ -40,7 +40,6 @@ parameters:
    - gfortran
    - libgfortran5
    - libopenblas-dev
-    - liblapack-dev
 - name: pipModules
  type: object
  default:
@@ -54,7 +53,6 @@ parameters:
    - hipSPARSE
    - llvm-project
    - rocBLAS
-    - rocm-cmake
    - rocm_smi_lib
    - rocminfo
    - rocprofiler-register
@@ -68,7 +66,6 @@ parameters:
    - llvm-project
    - hipBLAS-common
    - hipBLASLt
-    - rocm-cmake
    - rocBLAS
    - rocminfo
    - rocprofiler-register
@@ -112,7 +109,7 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -128,13 +125,10 @@ jobs:
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-  # NOTE: content between `---` is for transition support between old/new build systems
-  # and should be removed once transition is complete.
-  # -----------------------------
  # Build and install gtest and lapack
  # $(Pipeline.Workspace)/deps is a temporary folder for the build process
  # $(Pipeline.Workspace)/s/deps is part of the hipSPARSELt repo
-    - script: mkdir -p $(Pipeline.Workspace)/deps
+    - script: mkdir $(Pipeline.Workspace)/deps
      displayName: Create temp folder for external dependencies
  # hipSPARSELt already has a CMake script for external deps, so we can just run that
  # https://github.com/ROCm/hipSPARSELt/blob/develop/deps/CMakeLists.txt
@@ -150,35 +144,22 @@ jobs:
    - script: sudo make install
      displayName: Install hipSPARSELt external dependencies
      workingDirectory: $(Pipeline.Workspace)/deps
-  # -----------------------------
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
-        # NOTE: the following options are old build only 
-        # and can be removed after full transition to new build
-        # -DAMDGPU_TARGETS=${{ job.target }}
-        # -DCMAKE_Fortran_COMPILER=f95
-        # -DTensile_LOGIC=
-        # -DTensile_CPU_THREADS=
-        # -DTensile_LIBRARY_FORMAT=msgpack
-        # -DROCM_PATH=$(Agent.BuildDirectory)/rocm
-        # -DBUILD_CLIENTS_TESTS=ON
-        # -DBUILD_USE_LOCAL_TENSILE=OFF
        extraBuildFlags: >-
          -DCMAKE_BUILD_TYPE=Release
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-          -DGPU_TARGETS=${{ job.target }}
-          -DAMDGPU_TARGETS=${{ job.target }}
          -DCMAKE_Fortran_COMPILER=f95
+          -DAMDGPU_TARGETS=${{ job.target }}
          -DTensile_LOGIC=
          -DTensile_CPU_THREADS=
          -DTensile_LIBRARY_FORMAT=msgpack
+          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DBUILD_CLIENTS_TESTS=ON
          -DBUILD_USE_LOCAL_TENSILE=OFF
-          -DHIPSPARSELT_ENABLE_FETCH=ON
          -GNinja
        ${{ if ne(parameters.sparseCheckoutDir, '') }}:
          cmakeSourceDir: $(Build.SourcesDirectory)/projects/hipsparselt
--- a/.azuredevops/components/hipTensor.yml
+++ b/.azuredevops/components/hipTensor.yml
@@ -77,7 +77,6 @@ jobs:
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/llvm
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_BUILD_TYPE=Release
          -DHIPTENSOR_BUILD_TESTS=ON
--- a/.azuredevops/components/hipfort.yml
+++ b/.azuredevops/components/hipfort.yml
@@ -71,7 +71,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
--- a/.azuredevops/components/origami.yml
+++ b/.azuredevops/components/origami.yml
@@ -39,9 +39,6 @@ parameters:
    - python3
    - python3-dev
    - python3-pip
-    - libgtest-dev
-    - libboost-filesystem-dev
-    - libboost-program-options-dev
 - name: pipModules
  type: object
  default:
@@ -110,12 +107,8 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
-      parameters:
-        dependencyList:
-          - gtest
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -132,7 +125,7 @@ jobs:
      parameters:
        os: ${{ job.os }}
        extraBuildFlags: >-
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DORIGAMI_BUILD_SHARED_LIBS=ON
          -DORIGAMI_ENABLE_PYTHON=ON
@@ -213,15 +206,7 @@ jobs:
          ${{ if parameters.triggerDownstreamJobs }}:
            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        parameters:
-          componentName: ${{ parameters.componentName }}
-          os: ${{ job.os }}
-          testDir: '$(Agent.BuildDirectory)/rocm/bin'
-          testExecutable: './origami-tests'
-          testParameters: '--yaml origami-tests.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
      - script: |
-          set -e
          export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH

          echo "--- Running origami_test.py ---"
--- a/.azuredevops/components/rccl.yml
+++ b/.azuredevops/components/rccl.yml
@@ -70,7 +70,7 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rccl_build_${{ job.target }}
-    timeoutInMinutes: 120
+    timeoutInMinutes: 90
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -83,7 +83,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
--- a/.azuredevops/components/rdc.yml
+++ b/.azuredevops/components/rdc.yml
@@ -1,29 +1,10 @@
 parameters:
- name: componentName
-  type: string
-  default: rdc
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
-# monorepo related parameters
- name: sparseCheckoutDir
-  type: string
-  default: ''
- name: triggerDownstreamJobs
-  type: boolean
-  default: false
- name: downstreamAggregateNames
-  type: string
-  default: ''
- name: buildDependsOn
-  type: object
-  default: null
- name: unifiedBuild
-  type: boolean
-  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -52,7 +33,6 @@ parameters:
    - clr
    - hipBLAS-common
    - hipBLASLt
-    - hipRAND
    - llvm-project
    - rocBLAS
    - rocm-cmake
@@ -63,7 +43,6 @@ parameters:
    - rocprofiler
    - rocprofiler-register
    - rocprofiler-sdk
-    - rocRAND
    - ROCR-Runtime
 - name: rocmTestDependencies
  type: object
@@ -95,11 +74,7 @@ parameters:

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: ${{ parameters.componentName }}_build_${{ job.target }}
-    ${{ if parameters.buildDependsOn }}:
-      dependsOn:
-        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.target }}
+  - job: rdc_build_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -110,22 +85,16 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
-      parameters:
-        cmakeVersion: '3.25.0'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
  # Build grpc
    - task: Bash@3
      displayName: 'git clone grpc'
@@ -135,7 +104,6 @@ jobs:
        workingDirectory: $(Build.SourcesDirectory)
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        cmakeBuildDir: $(Build.SourcesDirectory)/grpc/build
        cmakeSourceDir: $(Build.SourcesDirectory)/grpc
        installDir: $(Build.SourcesDirectory)/bin
@@ -149,7 +117,6 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DGRPC_ROOT="$(Build.SourcesDirectory)/bin"
@@ -159,12 +126,9 @@ jobs:
          -DAMDGPU_TARGETS=${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
-        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -172,64 +136,60 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        gpuTarget: ${{ job.target }}

- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: ${{ parameters.componentName }}_test_${{ job.target }}
-      dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
-      condition:
-        and(succeeded(),
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-          eq(${{ parameters.aggregatePipeline }}, False)
-        )
-      variables:
-      - group: common
-      - template: /.azuredevops/variables-global.yml
-      - name: ROCM_PATH
-        value: $(Agent.BuildDirectory)/rocm
-      - name: ROCM_DIR
-        value: $(Agent.BuildDirectory)/rocm
-      pool: ${{ job.target }}_test_pool
-      workspace:
-        clean: all
-      steps:
-      - checkout: none
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        parameters:
-          gpuTarget: ${{ job.target }}
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        parameters:
-          checkoutRef: ${{ parameters.checkoutRef }}
-          dependencyList: ${{ parameters.rocmTestDependencies }}
-          gpuTarget: ${{ job.target }}
-          ${{ if parameters.triggerDownstreamJobs }}:
-              downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      - task: Bash@3
-        displayName: Setup test environment
-        inputs:
-          targetType: inline
-          script: |
-            sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rdcd /usr/sbin/rdcd
-            echo $(Agent.BuildDirectory)/rocm/lib/rdc/grpc/lib | sudo tee /etc/ld.so.conf.d/grpc.conf
-            sudo ldconfig -v
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      - task: Bash@3
-        displayName: Test rdc
-        inputs:
-          targetType: inline
-          script: >-
-            $(Agent.BuildDirectory)/rocm/share/rdc/rdctst_tests/rdctst
-            --batch_mode
-            --start_rdcd
-            --unauth_comm
-      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        parameters:
-          aptPackages: ${{ parameters.aptPackages }}
-          environment: test
-          gpuTarget: ${{ job.target }}
-          extraPaths: /home/user/workspace/rocm/bin
+- ${{ each job in parameters.jobMatrix.testJobs }}:
+  - job: rdc_test_${{ job.target }}
+    dependsOn: rdc_build_${{ job.target }}
+    condition:
+      and(succeeded(),
+        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+        eq(${{ parameters.aggregatePipeline }}, False)
+      )
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    - name: ROCM_PATH
+      value: $(Agent.BuildDirectory)/rocm
+    - name: ROCM_DIR
+      value: $(Agent.BuildDirectory)/rocm
+    pool: ${{ job.target }}_test_pool
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+      parameters:
+        gpuTarget: ${{ job.target }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmTestDependencies }}
+        gpuTarget: ${{ job.target }}
+    - task: Bash@3
+      displayName: Setup test environment
+      inputs:
+        targetType: inline
+        script: |
+          sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rdcd /usr/sbin/rdcd
+          echo $(Agent.BuildDirectory)/rocm/lib/rdc/grpc/lib | sudo tee /etc/ld.so.conf.d/grpc.conf
+          sudo ldconfig -v
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+    - task: Bash@3
+      displayName: Test rdc
+      inputs:
+        targetType: inline
+        script: >-
+          $(Agent.BuildDirectory)/rocm/share/rdc/rdctst_tests/rdctst
+          --batch_mode
+          --start_rdcd
+          --unauth_comm
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        environment: test
+        gpuTarget: ${{ job.target }}
+        extraPaths: /home/user/workspace/rocm/bin
--- a/.azuredevops/components/rocBLAS.yml
+++ b/.azuredevops/components/rocBLAS.yml
@@ -70,7 +70,6 @@ parameters:
    - hipBLAS-common
    - hipBLASLt
    - llvm-project
-    - rocm-cmake
    - rocminfo
    - rocprofiler-register
    - rocm_smi_lib
@@ -155,7 +154,7 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
--- a/.azuredevops/components/rocPRIM.yml
+++ b/.azuredevops/components/rocPRIM.yml
@@ -210,7 +210,7 @@ jobs:
        parameters:
          componentName: ${{ parameters.componentName }}
          testDir: '$(Agent.BuildDirectory)/rocm/bin/rocprim'
-          extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }}'
+          extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }} -E device_merge_inplace'
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
--- a/.azuredevops/components/rocm-examples.yml
+++ b/.azuredevops/components/rocm-examples.yml
@@ -14,17 +14,9 @@ parameters:
  type: object
  default:
    - cmake
-    - libdw-dev
    - libglfw3-dev
    - libmsgpack-dev
-    - libomp-dev
-    - libopencv-dev
    - libtbb-dev
-    - libtiff-dev
-    - libva-amdgpu-dev
-    - libavcodec-dev
-    - libavformat-dev
-    - libavutil-dev
    - ninja-build
    - python3-pip
 - name: rocmDependencies
@@ -41,24 +33,16 @@ parameters:
    - hipRAND
    - hipSOLVER
    - hipSPARSE
-    - hipTensor
    - llvm-project
-    - MIOpen
-    - MIVisionX
    - rocBLAS
-    - rocDecode
    - rocFFT
-    - rocJPEG
    - rocPRIM
    - rocprofiler-register
-    - rocprofiler-sdk
    - ROCR-Runtime
    - rocRAND
    - rocSOLVER
    - rocSPARSE
    - rocThrust
-    - rocWMMA
-    - rpp
 - name: rocmTestDependencies
  type: object
  default:
@@ -73,26 +57,18 @@ parameters:
    - hipRAND
    - hipSOLVER
    - hipSPARSE
-    - hipTensor
    - llvm-project
-    - MIOpen
-    - MIVisionX
    - rocBLAS
-    - rocDecode
    - rocFFT
    - rocminfo
    - rocPRIM
-    - rocJPEG
    - rocprofiler-register
-    - rocprofiler-sdk
    - ROCR-Runtime
    - rocRAND
    - rocSOLVER
    - rocSPARSE
    - rocThrust
    - roctracer
-    - rocWMMA
-    - rpp

 - name: jobMatrix
  type: object
@@ -121,10 +97,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-        registerROCmPackages: true
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
-      parameters:
-        cmakeVersion: '3.25.0'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -186,10 +158,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
-        registerROCmPackages: true
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
-      parameters:
-        cmakeVersion: '3.25.0'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
--- a/.azuredevops/components/rocm-libraries.yml
+++ b/.azuredevops/components/rocm-libraries.yml
@@ -43,14 +43,9 @@ parameters:
    - ninja-build
    - python3-pip
    - python3-venv
-    - googletest
-    - libgtest-dev
-    - libgmock-dev
-    - libboost-filesystem-dev
 - name: pipModules
  type: object
  default:
-    - msgpack
    - joblib
    - "packaging>=22.0"
    - pytest
@@ -107,7 +102,7 @@ jobs:
    workspace:
      clean: all
    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
@@ -152,13 +147,6 @@ jobs:
          echo "##vso[task.prependpath]$USER_BASE/bin"
          echo "##vso[task.setvariable variable=PytestCmakePath]$USER_BASE/share/Pytest/cmake"
        displayName: Set cmake configure paths
-    - task: Bash@3
-      displayName: Add ROCm binaries to PATH
-      inputs:
-        targetType: inline
-        script: |
-          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
-          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
--- a/.azuredevops/components/rocprofiler-sdk.yml
+++ b/.azuredevops/components/rocprofiler-sdk.yml
@@ -213,7 +213,6 @@ jobs:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
-          testDir: $(Agent.BuildDirectory)/s/build
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -226,11 +226,8 @@ jobs:
            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
        parameters:
-          cmakeSourceDir: $(Agent.BuildDirectory)/s/projects/rocprofiler-systems
    # build flags reference: https://rocm.docs.amd.com/projects/omnitrace/en/latest/install/install.html
          extraBuildFlags: >-
-            -DCMAKE_INSTALL_PREFIX=$(Agent.BuildDirectory)/rocprofiler-systems
-            -DROCPROFSYS_USE_PYTHON=ON
            -DROCPROFSYS_BUILD_TESTING=ON
            -DROCPROFSYS_BUILD_DYNINST=ON
            -DROCPROFSYS_BUILD_LIBUNWIND=ON
@@ -248,13 +245,11 @@ jobs:
        displayName: Set up rocprofiler-systems env
        inputs:
          targetType: inline
-          script: source $(Agent.BuildDirectory)/rocprofiler-systems/share/rocprofiler-systems/setup-env.sh
-          workingDirectory: $(Agent.BuildDirectory)/rocprofiler-systems/share/rocprofiler-systems
+          script: source share/rocprofiler-systems/setup-env.sh
+          workingDirectory: build
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
-          testDir: $(Agent.BuildDirectory)/s/build/tests/
-          testParameters: '--output-on-failure'
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
        parameters:
          gpuTarget: ${{ job.target }}
--- a/.azuredevops/dependencies/cli11.yml
+++ b/.azuredevops/dependencies/cli11.yml
@@ -1,63 +0,0 @@
-parameters:
- name: checkoutRepo
-  type: string
-  default: 'self'
- name: checkoutRef
-  type: string
-  default: ''
- name: cli11Version
-  type: string
-  default: ''
- name: aptPackages
-  type: object
-  default:
-    - cmake
-    - git
-    - ninja-build
-
- name: jobMatrix
-  type: object
-  default:
-    buildJobs:
-      - { os: ubuntu2204, packageManager: apt}
-      - { os: almalinux8, packageManager: dnf}
-
-jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: cli11_${{ job.os }}
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool:
-      vmImage: 'ubuntu-22.04'
-    ${{ if eq(job.os, 'almalinux8') }}:
-      container:
-        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-        endpoint: ContainerService3
-    workspace:
-      clean: all
-    steps:
-    - checkout: none
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - task: Bash@3
-      displayName: Clone cli11 ${{ parameters.cli11Version }}
-      inputs:
-        targetType: inline
-        script: git clone https://github.com/CLIUtils/CLI11.git -b ${{ parameters.cli11Version }}
-        workingDirectory: $(Agent.BuildDirectory)
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        os: ${{ job.os }}
-        cmakeBuildDir: $(Agent.BuildDirectory)/CLI11/build
-        cmakeSourceDir: $(Agent.BuildDirectory)/CLI11
-        useAmdclang: false
-        extraBuildFlags: >-
-          -DCMAKE_BUILD_TYPE=Release
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-      parameters:
-        os: ${{ job.os }}
--- a/.azuredevops/dependencies/yamlcpp.yml
+++ b/.azuredevops/dependencies/yamlcpp.yml
@@ -1,66 +0,0 @@
-parameters:
- name: checkoutRepo
-  type: string
-  default: 'self'
- name: checkoutRef
-  type: string
-  default: ''
- name: yamlcppVersion
-  type: string
-  default: ''
- name: aptPackages
-  type: object
-  default:
-    - cmake
-    - git
-    - ninja-build
-
- name: jobMatrix
-  type: object
-  default:
-    buildJobs:
-      - { os: ubuntu2204, packageManager: apt}
-      - { os: almalinux8, packageManager: dnf}
-
-jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: yamlcpp_${{ job.os }}
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool:
-      vmImage: 'ubuntu-22.04'
-    ${{ if eq(job.os, 'almalinux8') }}:
-      container:
-        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-        endpoint: ContainerService3
-    workspace:
-      clean: all
-    steps:
-    - checkout: none
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - task: Bash@3
-      displayName: Clone yaml-cpp ${{ parameters.yamlcppVersion }}
-      inputs:
-        targetType: inline
-        script: git clone  https://github.com/jbeder/yaml-cpp.git -b ${{ parameters.yamlcppVersion }}
-        workingDirectory: $(Agent.BuildDirectory)
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        os: ${{ job.os }}
-        cmakeBuildDir: $(Agent.BuildDirectory)/yaml-cpp/build
-        cmakeSourceDir: $(Agent.BuildDirectory)/yaml-cpp
-        useAmdclang: false
-        extraBuildFlags: >-
-          -DCMAKE_BUILD_TYPE=Release
-          -DYAML_CPP_BUILD_TOOLS=OFF
-          -DYAML_BUILD_SHARED_LIBS=OFF
-          -DYAML_CPP_INSTALL=ON
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-      parameters:
-        os: ${{ job.os }}
--- a/.azuredevops/tag-builds/cli11.yml
+++ b/.azuredevops/tag-builds/cli11.yml
@@ -1,23 +0,0 @@
-variables:
- group: common
- template: /.azuredevops/variables-global.yml
-
-parameters:
- name: cli11Version
-  type: string
-  default: "main"
-
-resources:
-  repositories:
-  - repository: pipelines_repo
-    type: github
-    endpoint: ROCm
-    name: ROCm/ROCm
-
-trigger: none
-pr: none
-
-jobs:
-  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/cli11.yml
-    parameters:
-      cli11Version: ${{ parameters.cli11Version }}
--- a/.azuredevops/tag-builds/yaml-cpp.yml
+++ b/.azuredevops/tag-builds/yaml-cpp.yml
@@ -1,24 +0,0 @@
-variables:
- group: common
- template: /.azuredevops/variables-global.yml
-
-parameters:
- name: yamlcppVersion
-  type: string
-  default: "0.8.0"
-
-resources:
-  repositories:
-  - repository: pipelines_repo
-    type: github
-    endpoint: ROCm
-    name: ROCm/ROCm
-
-trigger: none
-pr: none
-
-jobs:
-  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/yamlcpp.yml
-    parameters:
-      yamlcppVersion: ${{ parameters.yamlcppVersion }}
-      
--- a/.azuredevops/templates/steps/dependencies-cmake-latest.yml
+++ b/.azuredevops/templates/steps/dependencies-cmake-latest.yml
@@ -1,15 +1,10 @@
-parameters:
-  - name: cmakeVersion
-    type: string
-    default: '3.31.0'
-
 steps:
 - task: Bash@3
-  displayName: Install CMake ${{ parameters.cmakeVersion }}
+  displayName: Install CMake 3.31
  inputs:
    targetType: inline
    script: |
-      CMAKE_VERSION=${{ parameters.cmakeVersion }}
+      CMAKE_VERSION=3.31.0
      CMAKE_ROOT="$(Pipeline.Workspace)/cmake"

      echo "Downloading CMake $CMAKE_VERSION..."
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -46,10 +46,6 @@ parameters:
      pipelineId: 115
      developBranch: aomp-dev
      hasGpuTarget: false
-    aqlprofile:
-      pipelineId: 365
-      developBranch: develop
-      hasGpuTarget: false
    clr:
      pipelineId: 335
      developBranch: develop
@@ -130,17 +126,13 @@ parameters:
      pipelineId: 80
      developBranch: develop
      hasGpuTarget: true
-    origami:
-      pipelineId: 364
-      developBranch: develop
-      hasGpuTarget: true
    rccl:
      pipelineId: 107
      developBranch: develop
      hasGpuTarget: true
    rdc:
-      pipelineId: 360
-      developBranch: develop
+      pipelineId: 100
+      developBranch: amd-staging
      hasGpuTarget: false
    rocAL:
      pipelineId: 151
@@ -227,8 +219,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    rocprofiler-systems:
-      pipelineId: 345
-      developBranch: develop
+      pipelineId: 255
+      developBranch: amd-staging
      hasGpuTarget: true
    rocPyDecode:
      pipelineId: 239
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -43,7 +43,6 @@ Blit
 Blockwise
 Bluefield
 Bootloader
-Broadcom
 CAS
 CCD
 CDNA
@@ -147,8 +146,6 @@ Filesystem
 FindDb
 Flang
 FlashAttention
-FlashInfer’s
-FlashInfer
 FluxBenchmark
 Fortran
 Fuyu
@@ -313,7 +310,6 @@ Mooncake
 Mpops
 Multicore
 Multithreaded
-MXFP
 MyEnvironment
 MyST
 NANOO
@@ -484,7 +480,6 @@ TCI
 TCIU
 TCP
 TCR
-TVM
 THREADGROUPS
 threadgroups
 TensorRT
@@ -627,7 +622,6 @@ coalescable
 codename
 collater
 comgr
-compat
 completers
 composable
 concretization
@@ -675,7 +669,6 @@ detections
 dev
 devicelibs
 devsel
-dgl
 dimensionality
 disambiguates
 distro
@@ -715,7 +708,6 @@ githooks
 github
 globals
 gnupg
-gpu
 grayscale
 gx
 gzip
@@ -770,7 +762,6 @@ invariants
 invocating
 ipo
 jax
-json
 kdb
 kfd
 kv
@@ -792,7 +783,6 @@ lossy
 macOS
 matchers
 maxtext
-megablocks
 megatron
 microarchitecture
 migraphx
@@ -951,7 +941,6 @@ softmax
 spack
 spmm
 src
-stanford
 stochastically
 strided
 subcommand
@@ -971,7 +960,6 @@ tabindex
 targetContainer
 td
 tensorfloat
-tf
 th
 tokenization
 tokenize
@@ -984,7 +972,6 @@ toolset
 toolsets
 torchtitan
 torchvision
-tp
 tqdm
 tracebacks
 txt
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,123 +4,9 @@ This page is a historical overview of changes made to ROCm components. This
 consolidated changelog documents key modifications and improvements across
 different versions of the ROCm software stack and its components.

-## ROCm 7.0.2
-
-See the [ROCm 7.0.2 release notes](https://rocm.docs.amd.com/en/docs-7.0.2/about/release-notes.html#rocm-7-0-2-release-notes)
-for a complete overview of this release.
-
-### **AMD SMI** (26.0.2)
-
-#### Added
-
-* Added `bad_page_threshold_exceeded` field to `amd-smi static --ras`, which compares retired pages count against bad page threshold. This field displays `True` if retired pages exceed the threshold, `False` if within threshold, or `N/A` if threshold data is unavailable. Note that `sudo` is required to have the `bad_page_threshold_exceeded` field populated.
-
-#### Removed
-
-* Removed gpuboard and baseboard temperatures enums in amdsmi Python Library.
-    * `AmdSmiTemperatureType` had issues with referencing the correct attribute. As such, the following duplicate enums have been removed:
-        - `AmdSmiTemperatureType.GPUBOARD_NODE_FIRST`
-        - `AmdSmiTemperatureType.GPUBOARD_VR_FIRST`
-        - `AmdSmiTemperatureType.BASEBOARD_FIRST`
-
-#### Resolved Issues
-
-* Fixed `attribute error` in `amd-smi monitor` on Linux Guest systems, where the violations argument caused CLI to break.
-* Fixed certain output in `amd-smi monitor` when GPUs are partitioned.  
-  * It fixes the amd-smi monitor such as: `amd-smi monitor -Vqt`, `amd-smi monitor -g 0 -Vqt -w 1`, `amd-smi monitor -Vqt --file /tmp/test1`, etc. These commands will now be able to display as normal in partitioned GPU scenarios.
-
-* Fixed an issue where using `amd-smi ras --folder <folder_name>` was forcing the created folder's name to be lowercase. This fix also allows all string input options to be case insensitive.
-
-* Fixed an issue of some processes not being detected by AMD SMI despite making use of KFD resources. This fix, with the addition of KFD Fallback for process detection, ensures that all KFD processes will be detected.
-
-* Multiple CPER issues were fixed.  
-  - Issue of being unable to query for additional CPERs after 20 were generated on a single device.
-  - Issue where the RAS HBM CRC read was failing due to an incorrect AFID value.
-  - Issue where RAS injections were not consistently producing related CPERs.
-
-### **HIP** (7.0.2)
-
-#### Added
-
-* Support for the `hipMemAllocationTypeUncached` flag, enabling developers to allocate uncached memory. This flag is now supported in the following APIs:
-    - `hipMemGetAllocationGranularity` determines the recommended allocation granularity for uncached memory.
-    - `hipMemCreate` allocates memory with uncached properties.
-
-#### Resolved issues
-
-* A compilation failure affecting applications that compile kernels using `hiprtc` with the compiler option `std=c++11`.
-* A permission-related error occurred during the execution of `hipLaunchHostFunc`. This API is now supported and permitted to run during stream capture, aligning its behavior with CUDA.
-* A numerical error during graph capture of kernels that rely on a remainder in `globalWorkSize`, in frameworks like MIOpen and PyTorch, where the grid size is not a multiple of the block size. To ensure correct replay behavior, HIP runtime now stores this remainder in `hip::GraphKernelNode` during `hipExtModuleLaunchKernel` capture, enabling accurate execution and preventing corruption.
-* A page fault occurred during viewport rendering while running the file undo.blend in Blender. The issue was resolved by the HIP runtime, which reused the same context during image creation.
-* Resolved a segmentation fault in `gpu_metrics`, which is used in threshold logic for command submission patches to GPU device(s) during CPU synchronization.
-
-### **hipBLAS** (3.0.2)
- 
-#### Added
- 
-* Enabled support for gfx1150, gfx1151, gfx1200, and gfx1201 AMD hardware.
-
-### **RCCL** (2.26.6)
-
-#### Added
-
-* Enabled double-buffering in `reduceCopyPacks` to trigger pipelining, especially to overlap bf16 arithmetic.
-* Added `--force-reduce-pipeline` as an option that can be passed to the `install.sh` script. Passing this option will enable software-triggered pipelining `bfloat16` reductions (that is, `all_reduce`, `reduce_scatter`, and `reduce`).
-
-### **rocBLAS** (5.0.2)
- 
-#### Added
- 
-* Enabled gfx1150 and gfx1151.
-* The `ROCBLAS_USE_HIPBLASLT_BATCHED` variable to independently control the batched hipblaslt backend. Set `ROCBLAS_USE_HIPBLASLT_BATCHED=0` to disable batched GEMM use of the hipblaslt backend.
-
-#### Resolved issues
- 
-* Set the imaginary portion of the main diagonal of the output matrix to zero in syrk and herk.
-
-### **ROCdbgapi** (0.77.4)
-
-#### Added
-
-* ROCdbgapi documentation link in the README.md file.
-
-### **ROCm Systems Profiler** (1.1.1)
-
-#### Resolved issues
-
-* Fixed an issue where ROC-TX ranges were displayed as two separate events instead of a single spanning event.
-
-### **rocPRIM** (4.0.1)
-
-#### Resolved issues
-
-* Fixed compilation issue when using `rocprim::texture_cache_iterator`.
-* Fixed a HIP version check used to determine whether `hipStreamLegacy` is supported. This resolves runtime errors that occur when `hipStreamLegacy` is used in ROCm 7.0.0 and later.
-
-### **rocSPARSE** (4.0.3)
-
-#### Resolved issues
-
-* Fixed an issue causing premature deallocation of internal buffers while still in use.
-
-### **rocSOLVER** (3.30.1)
-
-#### Optimized
-
-Improved the performance of:
-
-* LARFT and downstream functions such as GEQRF and ORMTR.
-* LARF and downstream functions such as GEQR2.
-* ORMTR and downstream functions such as SYEVD.
-* GEQR2 and downstream functions such as GEQRF.
-
-## ROCm 7.0.1
-
-ROCm 7.0.1 is a quality release that resolves the existing issue. There is no change in component from the previous ROCm 7.0.0 release. See the [ROCm 7.0.1 release notes](https://rocm.docs.amd.com/en/docs-7.0.1/about/release-notes.html#rocm-7-0-1-release-notes) for a complete overview of this release.
-
 ## ROCm 7.0.0

-See the [ROCm 7.0.0 release notes](https://rocm.docs.amd.com/en/docs-7.0.0/about/release-notes.html#rocm-7-0-0-release-notes)
+See the [ROCm 7.0.0 release notes](https://rocm-stg.amd.com/en/latest/about/release-notes.html#rocm-7-0-0-release-notes)
 for a complete overview of this release.

 ### **AMD SMI** (26.0.0)
@@ -912,15 +798,11 @@ HIP runtime has the following functional improvements which improves runtime per
 * Compatibility with NCCL 2.25.1.
 * Compatibility with NCCL 2.26.6.

-#### Optimized
-* Improved the performance of the `FP8` Sum operation by upcasting to `FP16`.
-
 #### Resolved issues

 * Resolved an issue when using more than 64 channels when multiple collectives are used in the same `ncclGroup()` call.
 * Fixed unit test failures in tests ending with the `ManagedMem` and `ManagedMemGraph` suffixes.
 * Fixed a suboptimal algorithmic switching point for AllReduce on the AMD Instinct MI300X.
-* Fixed broken functionality within the LL protocol on gfx950 by disabling inlining of LLGenericOp kernels.
 * Fixed the known issue "When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault" with a design change to use `comm` instead of `rank` for `mscclStatus`. The global map for `comm` to `mscclStatus` is still not thread safe but should be explicitly handled by mutexes for read-write operations. This is tested for correctness, but there is a plan to use a thread-safe map data structure in an upcoming release.

 ### **rocAL** (2.3.0)
@@ -4116,7 +3998,7 @@ memory partition modes upon an invalid argument return from memory partition mod

 - JSON output plugin for `rocprofv2`. The JSON file matches Google Trace Format making it easy to load on Perfetto, Chrome tracing, or Speedscope. For Speedscope, use `--disable-json-data-flows` option as speedscope doesn't work with data flows.
 - `--no-serialization` flag to disable kernel serialization when `rocprofv2` is in counter collection mode. This allows `rocprofv2` to avoid deadlock when profiling certain programs in counter collection mode.
- `FP64_ACTIVE` and `ENGINE_ACTIVE` metrics to AMD Instinct MI300 GPU
+- `FP64_ACTIVE` and `ENGINE_ACTIVE` metrics to AMD Instinct MI300 accelerator
 - New HIP APIs with struct defined inside union.
 - Early checks to confirm the eligibility of ELF file in ATT plugin
 - Support for kernel name filtering in `rocprofv2`
@@ -4140,18 +4022,18 @@ memory partition modes upon an invalid argument return from memory partition mod

 #### Resolved issues

- Bandwidth measurement in AMD Instinct MI300 GPU
+- Bandwidth measurement in AMD Instinct MI300 accelerator
 - Perfetto plugin issue of `roctx` trace not getting displayed
 - `--help` for counter collection
 - Signal management issues in `queue.cpp`
 - Perfetto tracks for multi-GPU
 - Perfetto plugin usage with `rocsys`
 - Incorrect number of columns in the output CSV files for counter collection and kernel tracing
- The ROCProfiler hang issue when running kernel trace, thread trace, or counter collection on Iree benchmark for AMD Instinct MI300 GPU
+- The ROCProfiler hang issue when running kernel trace, thread trace, or counter collection on Iree benchmark for AMD Instinct MI300 accelerator
 - Build errors thrown during parsing of unions
 - The system hang caused while running `--kernel-trace` with Perfetto for certain applications
 - Missing profiler records issue caused while running `--trace-period`
- The hang issue of `ProfilerAPITest` of `runFeatureTests` on AMD Instinct MI300 GPU
+- The hang issue of `ProfilerAPITest` of `runFeatureTests` on AMD Instinct MI300 accelerator
 - Segmentation fault on Navi32


@@ -5548,7 +5430,7 @@ See [issue #3499](https://github.com/ROCm/ROCm/issues/3499) on GitHub.
  intermediary script to call the application with the necessary arguments, then call the script with Omniperf. This
  issue is fixed in a future release of Omniperf. See [#347](https://github.com/ROCm/rocprofiler-compute/issues/347).

- Omniperf might not work with AMD Instinct MI300 GPUs out of the box, resulting in the following error:
+- Omniperf might not work with AMD Instinct MI300 accelerators out of the box, resulting in the following error:
  "*ERROR gfx942 is not enabled rocprofv1. Available profilers include: ['rocprofv2']*". As a workaround, add the
  environment variable `export ROCPROF=rocprofv2`.

@@ -5664,7 +5546,7 @@ See [issue #3498](https://github.com/ROCm/ROCm/issues/3498) on GitHub.

 #### Optimized

-* Improved performance of Level 1 `dot_batched` and `dot_strided_batched` for all precisions. Performance enhanced by 6 times for bigger problem sizes, as measured on an Instinct MI210 GPU.
+* Improved performance of Level 1 `dot_batched` and `dot_strided_batched` for all precisions. Performance enhanced by 6 times for bigger problem sizes, as measured on an Instinct MI210 accelerator.

 #### Removed

--- a/RELEASE.md
+++ b/RELEASE.md
--- a/default.xml
+++ b/default.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-    <default revision="refs/tags/rocm-7.0.2"
+    <default revision="refs/tags/rocm-7.0.0"
     remote="rocm-org"
     sync-c="true"
     sync-j="4" />
@@ -41,6 +41,7 @@
    <project groups="mathlibs" name="MIVisionX" />
    <project groups="mathlibs" name="ROCmValidationSuite" />
    <project groups="mathlibs" name="composable_kernel" />
+    <project groups="mathlibs" name="hipSOLVER" />
    <project groups="mathlibs" name="hipTensor" />
    <project groups="mathlibs" name="hipfort" />
    <project groups="mathlibs" name="rccl" />
@@ -56,6 +57,7 @@
    <project groups="mathlibs" name="rocm-libraries" />
    <project groups="mathlibs" name="rocPyDecode" />
    <project groups="mathlibs" name="rocSHMEM" />
+    <project groups="mathlibs" name="rocSOLVER" />
    <project groups="mathlibs" name="rocWMMA" />
    <project groups="mathlibs" name="rocm-cmake" />
    <project groups="mathlibs" name="rpp" />
--- a/docs/about/license.md
+++ b/docs/about/license.md
@@ -30,7 +30,6 @@ additional licenses. Please review individual repositories for more information.
 | [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
 | [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
 | [AQLprofile](https://github.com/rocm/aqlprofile/) | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE.md) |
-| [Cluster Validation Suite](https://github.com/ROCm/cvs) | [MIT](https://github.com/ROCm/cvs/blob/main/LICENSE) |
 | [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
 | [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
 | [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
--- a/docs/compatibility/compatibility-matrix-historical-6.0.csv
+++ b/docs/compatibility/compatibility-matrix-historical-6.0.csv
@@ -1,137 +1,136 @@
-ROCm Version,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
-      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
-      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
-      ,,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
-      ,"RHEL 10.0 [#rhel-10-702-past-60]_, 9.6 [#rhel-10-702-past-60]_, 9.4 [#rhel-94-702-past-60]_","RHEL 9.6 [#rhel-10-702-past-60]_, 9.4 [#rhel-94-702-past-60]_","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
-      ,RHEL 8.10 [#rhel-700-past-60]_,RHEL 8.10 [#rhel-700-past-60]_,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
-      ,SLES 15 SP7 [#sles-db-700-past-60]_,SLES 15 SP7 [#sles-db-700-past-60]_,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
-      ,,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
-      ,"Oracle Linux 10, 9, 8 [#ol-700-mi300x-past-60]_","Oracle Linux 9, 8 [#ol-700-mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
-      ,"Debian 13 [#db-mi300x-past-60]_, 12 [#sles-db-700-past-60]_",Debian 12 [#sles-db-700-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
-      ,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,,,,,,,,,,,,
-      ,Rocky Linux 9 [#rl-700-past-60]_,Rocky Linux 9 [#rl-700-past-60]_,,,,,,,,,,,,,,,,,,
-      ,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,,,,,,,,,,,,,,,,,,
-      ,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
-      ,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
-      ,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
-      ,RDNA4,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
-      ,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
-      ,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
-      ,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950 [#mi350x-os-past-60]_,gfx950 [#mi350x-os-past-60]_,,,,,,,,,,,,,,,,,,
-      ,gfx1201 [#RDNA-OS-700-past-60]_,gfx1201 [#RDNA-OS-700-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-      ,gfx1200 [#RDNA-OS-700-past-60]_,gfx1200 [#RDNA-OS-700-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-      ,gfx1101 [#RDNA-OS-700-past-60]_ [#rd-v710-past-60]_,gfx1101 [#RDNA-OS-700-past-60]_ [#rd-v710-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-      ,gfx1100 [#RDNA-OS-700-past-60]_,gfx1100 [#RDNA-OS-700-past-60]_,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
-      ,gfx1030 [#RDNA-OS-700-past-60]_ [#rd-v620-past-60]_,gfx1030 [#RDNA-OS-700-past-60]_ [#rd-v620-past-60]_,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
-      ,gfx942 [#mi325x-os-past-60]_ [#mi300x-os-past-60]_ [#mi300A-os-past-60]_,gfx942 [#mi325x-os-past-60]_ [#mi300x-os-past-60]_ [#mi300A-os-past-60]_,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
-      ,gfx90a [#mi200x-os-past-60]_,gfx90a [#mi200x-os-past-60]_,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
-      ,gfx908 [#mi100-os-past-60]_,gfx908 [#mi100-os-past-60]_,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
-      ,,,,,,,,,,,,,,,,,,,,
-      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
-      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350-past-60]_","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
-      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
-      :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat-past-60]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat-past-60]_,N/A,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat-past-60]_,N/A,b6356,b6356,b6356,b6356,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`FlashInfer <../compatibility/ml-compatibility/flashinfer-compatibility>` [#flashinfer_compat-past-60]_,N/A,N/A,N/A,N/A,v0.2.5,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
-      ,,,,,,,,,,,,,,,,,,,,
-      ,,,,,,,,,,,,,,,,,,,,
-      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      `UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
-      `UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
-      ,,,,,,,,,,,,,,,,,,,,
-      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      Thrust,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
-      CUB,2.6.0,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
-      ,,,,,,,,,,,,,,,,,,,,
-     DRIVER & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.10.2, 30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch-past-60]_, 30.10, 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
-      ,,,,,,,,,,,,,,,,,,,,
-      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
-      :doc:`MIGraphX <amdmigraphx:index>`,2.13.0,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
-      :doc:`MIOpen <miopen:index>`,3.5.0,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`MIVisionX <mivisionx:index>`,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
-      :doc:`rocAL <rocal:index>`,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      :doc:`rocDecode <rocdecode:index>`,1.0.0,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
-      :doc:`rocJPEG <rocjpeg:index>`,1.1.0,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`rocPyDecode <rocpydecode:index>`,0.6.0,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`RPP <rpp:index>`,2.0.0,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
-      ,,,,,,,,,,,,,,,,,,,,
-      COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`RCCL <rccl:index>`,2.26.6,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
-      :doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      ,,,,,,,,,,,,,,,,,,,,
-      MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
-      :doc:`hipBLAS <hipblas:index>`,3.0.2,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
-      :doc:`hipBLASLt <hipblaslt:index>`,1.0.0,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
-      :doc:`hipFFT <hipfft:index>`,1.0.20,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
-      :doc:`hipfort <hipfort:index>`,0.7.0,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
-      :doc:`hipRAND <hiprand:index>`,3.0.0,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
-      :doc:`hipSOLVER <hipsolver:index>`,3.0.0,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
-      :doc:`hipSPARSE <hipsparse:index>`,4.0.1,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
-      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.4,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
-      :doc:`rocALUTION <rocalution:index>`,4.0.0,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
-      :doc:`rocBLAS <rocblas:index>`,5.0.2,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
-      :doc:`rocFFT <rocfft:index>`,1.0.34,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
-      :doc:`rocRAND <rocrand:index>`,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
-      :doc:`rocSOLVER <rocsolver:index>`,3.30.1,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
-      :doc:`rocSPARSE <rocsparse:index>`,4.0.2,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
-      :doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
-      :doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
-      ,,,,,,,,,,,,,,,,,,,,
-      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`hipCUB <hipcub:index>`,4.0.0,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
-      :doc:`rocPRIM <rocprim:index>`,4.0.1,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`rocThrust <rocthrust:index>`,4.0.0,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
-      ,,,,,,,,,,,,,,,,,,,,
-      SUPPORT LIBS,,,,,,,,,,,,,,,,,,,,
-      `hipother <https://github.com/ROCm/hipother>`_,7.0.51830,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      `rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.2,7.0.1/7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
-      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
-      ,,,,,,,,,,,,,,,,,,,,
-      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`AMD SMI <amdsmi:index>`,26.0.2,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
-      :doc:`ROCm Data Center Tool <rdc:index>`,1.1.0,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
-      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
-      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
-      ,,,,,,,,,,,,,,,,,,,,
-      PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,,
-      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
-      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.2.3,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.1.1,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCProfiler <rocprofiler:index>`,2.0.70002,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
-      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCTracer <roctracer:index>`,4.1.70002,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
-      ,,,,,,,,,,,,,,,,,,,,
-      DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,,
-      :doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
-      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
-      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
-      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
-      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
-      ,,,,,,,,,,,,,,,,,,,,
-      COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
-      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      `Flang <https://github.com/ROCm/flang>`_,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      :doc:`llvm-project <llvm-project:index>`,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25385,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      ,,,,,,,,,,,,,,,,,,,,
-      RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,,
-      :doc:`AMD CLR <hip:understand/amd_clr>`,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      :doc:`HIP <hip:index>`,7.0.51831,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
-      :doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
+ROCm Version,7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
+      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
+      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
+      ,,,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
+      ,"RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
+      ,RHEL 8.10 [#rhel-700]_,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
+      ,SLES 15 SP7 [#sles-db-700]_,"SLES 15 SP7, SP6","SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
+      ,,,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
+      ,"Oracle Linux 9, 8 [#ol-700-mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
+      ,Debian 12,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
+      ,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,Azure Linux 3.0 [#az-mi300x-630-past-60]_,,,,,,,,,,,,
+,Rocky Linux 9,,,,,,,,,,,,,,,,,,
+      ,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,,,,,,,,,,,,,,,,,,
+,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
+      ,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
+      ,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
+      ,RDNA4,RDNA4,RDNA4,RDNA4,,,,,,,,,,,,,,,
+      ,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
+      ,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
+      ,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950,,,,,,,,,,,,,,,,,,
+,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+      ,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
+      ,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
+      ,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
+      ,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
+      ,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
+      ,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
+,,,,,,,,,,,,,,,,,,,
+      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.7, 2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
+      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.19.1, 2.18.1","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
+      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
+      :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,N/A,N/A,N/A,85f95ae,85f95ae,85f95ae,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`,N/A,N/A,N/A,N/A,N/A,0.7.0,0.7.0,0.7.0,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat]_,N/A,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,N/A,N/A,N/A,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.20.0,1.20.0,1.20.0,1.20.0,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
+,,,,,,,,,,,,,,,,,,,
+      ,,,,,,,,,,,,,,,,,,,
+      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      `UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
+      `UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
+      ,,,,,,,,,,,,,,,,,,,
+      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      Thrust,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
+      CUB,2.6.0,2.5.0,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
+,,,,,,,,,,,,,,,,,,,
+      KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.10, 6.4.x, 6.3.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
+      ,,,,,,,,,,,,,,,,,,,
+      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
+      :doc:`MIGraphX <amdmigraphx:index>`,2.13.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
+      :doc:`MIOpen <miopen:index>`,3.5.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`MIVisionX <mivisionx:index>`,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
+      :doc:`rocAL <rocal:index>`,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      :doc:`rocDecode <rocdecode:index>`,1.0.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
+      :doc:`rocJPEG <rocjpeg:index>`,1.1.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`rocPyDecode <rocpydecode:index>`,0.6.0,0.3.1,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`RPP <rpp:index>`,2.0.0,1.9.10,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
+      ,,,,,,,,,,,,,,,,,,,
+      COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`RCCL <rccl:index>`,2.26.6,2.22.3,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
+      :doc:`rocSHMEM <rocshmem:index>`,3.0.0,2.0.1,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      ,,,,,,,,,,,,,,,,,,,
+      MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
+      :doc:`hipBLAS <hipblas:index>`,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
+      :doc:`hipBLASLt <hipblaslt:index>`,1.0.0,0.12.1,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
+      :doc:`hipFFT <hipfft:index>`,1.0.20,1.0.18,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
+      :doc:`hipfort <hipfort:index>`,0.7.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
+      :doc:`hipRAND <hiprand:index>`,3.0.0,2.12.0,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
+      :doc:`hipSOLVER <hipsolver:index>`,3.0.0,2.4.0,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
+      :doc:`hipSPARSE <hipsparse:index>`,4.0.1,3.2.0,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
+      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.4,0.2.3,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
+      :doc:`rocALUTION <rocalution:index>`,4.0.0,3.2.3,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
+      :doc:`rocBLAS <rocblas:index>`,5.0.0,4.4.1,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
+      :doc:`rocFFT <rocfft:index>`,1.0.34,1.0.32,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
+      :doc:`rocRAND <rocrand:index>`,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
+      :doc:`rocSOLVER <rocsolver:index>`,3.30.0,3.28.2,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
+      :doc:`rocSPARSE <rocsparse:index>`,4.0.2,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
+      :doc:`rocWMMA <rocwmma:index>`,2.0.0,1.7.0,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
+      :doc:`Tensile <tensile:src/index>`,4.44.0,4.43.0,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
+      ,,,,,,,,,,,,,,,,,,,
+      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`hipCUB <hipcub:index>`,4.0.0,3.4.0,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`hipTensor <hiptensor:index>`,2.0.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
+      :doc:`rocPRIM <rocprim:index>`,4.0.0,3.4.1,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`rocThrust <rocthrust:index>`,4.0.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
+      ,,,,,,,,,,,,,,,,,,,
+      SUPPORT LIBS,,,,,,,,,,,,,,,,,,,
+      `hipother <https://github.com/ROCm/hipother>`_,7.0.51830,6.4.43483,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.0,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
+      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
+      ,,,,,,,,,,,,,,,,,,,
+      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`AMD SMI <amdsmi:index>`,26.0.0,25.5.1,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
+      :doc:`ROCm Data Center Tool <rdc:index>`,1.1.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
+      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.7.0,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
+      ,,,,,,,,,,,,,,,,,,,
+      PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,,,
+      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
+      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.2.3,3.1.1,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.1.0,1.0.2,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCProfiler <rocprofiler:index>`,2.0.70000,2.0.60403,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
+      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCTracer <roctracer:index>`,4.1.70000,4.1.60403,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
+      ,,,,,,,,,,,,,,,,,,,
+      DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,,,
+      :doc:`HIPIFY <hipify:index>`,20.0.0,19.0.0,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
+      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.3,0.77.2,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
+      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
+      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
+      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.0.4,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
+      ,,,,,,,,,,,,,,,,,,,
+      COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
+      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      `Flang <https://github.com/ROCm/flang>`_,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      :doc:`llvm-project <llvm-project:index>`,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25314,19.0.0.25224,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+,,,,,,,,,,,,,,,,,,,
+      RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,,,
+      :doc:`AMD CLR <hip:understand/amd_clr>`,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      :doc:`HIP <hip:index>`,7.0.51830,6.4.43484,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
+      :doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.15.0,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -10,9 +10,10 @@ Use this matrix to view the ROCm compatibility and system requirements across su

 You can also refer to the :ref:`past versions of ROCm compatibility matrix<past-rocm-compatibility-matrix>`.

-GPUs listed in the following table support compute workloads (no display
-information or graphics). If you’re using ROCm with AMD Radeon GPUs or Ryzen APUs for graphics
-workloads, see the :docs:`Use ROCm on Radeon and Ryzen <radeon:index.html>` to verify
+Accelerators and GPUs listed in the following table support compute workloads (no display
+information or graphics). If you’re using ROCm with AMD Radeon or Radeon Pro GPUs for graphics
+workloads, see the `Use ROCm on Radeon GPU documentation
+<https://rocm.docs.amd.com/projects/radeon/en/latest/docs/compatibility.html>`_ to verify
 compatibility and system requirements.

 .. |br| raw:: html
@@ -22,20 +23,20 @@ compatibility and system requirements.
 .. container:: format-big-table

  .. csv-table::
-      :header: "ROCm Version", "7.0.2", "7.0.1/7.0.0", "6.4.0"
+      :header: "ROCm Version", "7.0.0", "6.4.3", "6.3.0"
      :stub-columns: 1

-      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.3,Ubuntu 24.04.2
+      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.3,Ubuntu 24.04.2,Ubuntu 24.04.2
      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5
-      ,"RHEL 10.0 [#rhel-10-702]_, 9.6 [#rhel-10-702]_, 9.4 [#rhel-94-702]_","RHEL 9.6 [#rhel-10-702]_, 9.4 [#rhel-94-702]_","RHEL 9.5, 9.4"
-      ,RHEL 8.10 [#rhel-700]_,RHEL 8.10 [#rhel-700]_,RHEL 8.10
-      ,SLES 15 SP7 [#sles-db-700]_,SLES 15 SP7 [#sles-db-700]_,SLES 15 SP6
-      ,"Oracle Linux 10, 9, 8 [#ol-700-mi300x]_","Oracle Linux 9, 8 [#ol-700-mi300x]_","Oracle Linux 9, 8 [#ol-mi300x]_"
-      ,"Debian 13 [#db-mi300x]_, 12 [#sles-db-700]_",Debian 12 [#sles-db-700]_,Debian 12 [#single-node]_
-      ,Azure Linux 3.0 [#az-mi300x]_,Azure Linux 3.0 [#az-mi300x]_,Azure Linux 3.0 [#az-mi300x]_
-      ,Rocky Linux 9 [#rl-700]_,Rocky Linux 9 [#rl-700]_,
+      ,"RHEL 9.6, 9.4","RHEL 9.6, 9.4","RHEL 9.5, 9.4"
+      ,RHEL 8.10 [#rhel-700]_,RHEL 8.10,RHEL 8.10
+      ,SLES 15 SP7 [#sles-db-700]_,"SLES 15 SP7, SP6","SLES 15 SP6, SP5"
+      ,"Oracle Linux 9, 8 [#ol-700-mi300x]_","Oracle Linux 9, 8 [#ol-mi300x]_",Oracle Linux 8.10 [#ol-mi300x]_
+      ,Debian 12 [#sles-db-700]_,Debian 12 [#single-node]_,
+      ,Azure Linux 3.0 [#az-mi300x]_,Azure Linux 3.0 [#az-mi300x]_,
+      ,Rocky Linux 9 [#rl-700]_,,
      ,.. _architecture-support-compatibility-matrix:,,
-      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,CDNA4,
+      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA4,,
      ,CDNA3,CDNA3,CDNA3
      ,CDNA2,CDNA2,CDNA2
      ,CDNA,CDNA,CDNA
@@ -43,143 +44,136 @@ compatibility and system requirements.
      ,RDNA3,RDNA3,RDNA3
      ,RDNA2,RDNA2,RDNA2
      ,.. _gpu-support-compatibility-matrix:,,
-      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950 [#mi350x-os]_,gfx950 [#mi350x-os]_,
-      ,gfx1201 [#RDNA-OS-700]_,gfx1201 [#RDNA-OS-700]_,
-      ,gfx1200 [#RDNA-OS-700]_,gfx1200 [#RDNA-OS-700]_,
-      ,gfx1101 [#RDNA-OS-700]_ [#rd-v710]_,gfx1101 [#RDNA-OS-700]_ [#rd-v710]_,
-      ,gfx1100 [#RDNA-OS-700]_,gfx1100 [#RDNA-OS-700]_,gfx1100
-      ,gfx1030 [#RDNA-OS-700]_ [#rd-v620]_,gfx1030 [#RDNA-OS-700]_ [#rd-v620]_,gfx1030
-      ,gfx942 [#mi325x-os]_ [#mi300x-os]_ [#mi300A-os]_,gfx942 [#mi325x-os]_ [#mi300x-os]_ [#mi300A-os]_,gfx942
-      ,gfx90a [#mi200x-os]_,gfx90a [#mi200x-os]_,gfx90a
-      ,gfx908 [#mi100-os]_,gfx908 [#mi100-os]_,gfx908
+      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx950 [#mi350x-os]_,,
+      ,gfx1201 [#RDNA-OS-700]_,gfx1201 [#RDNA-OS]_,
+      ,gfx1200 [#RDNA-OS-700]_,gfx1200 [#RDNA-OS]_,
+      ,gfx1101 [#RDNA-OS-700]_ [#rd-v710]_,gfx1101 [#RDNA-OS]_ [#7700XT-OS]_,
+      ,gfx1100 [#RDNA-OS-700]_,gfx1100,gfx1100
+      ,gfx1030 [#RDNA-OS-700]_ [#rd-v620]_,gfx1030,gfx1030
+      ,gfx942 [#mi325x-os]_ [#mi300x-os]_ [#mi300A-os]_,gfx942,gfx942
+      ,gfx90a [#mi200x-os]_,gfx90a,gfx90a
+      ,gfx908 [#mi100-os]_,gfx908,gfx908
      ,,,
      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
-      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.8, 2.7, 2.6","2.7, 2.6, 2.5","2.6, 2.5, 2.4, 2.3"
-      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.19.1, 2.18.1, 2.17.1 [#tf-mi350]_","2.19.1, 2.18.1, 2.17.1 [#tf-mi350]_","2.18.1, 2.17.1, 2.16.2"
-      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.6.0,0.4.35
-      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,2.4.0
-      :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,b6356,b5997
-      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.22.0,1.20.0
+      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.7, 2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 2.1, 2.0, 1.13"
+      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.19.1, 2.18.1","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1"
+      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.6.0,0.4.35,0.4.31
+      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat]_,N/A,N/A,85f95ae
+      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,0.7.0
+      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.22.0,1.20.0,1.17.3
      ,,,
      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
-      `UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.4.0,>=1.3.0
-      `UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.17.0,>=1.15.0
+      `UCC <https://github.com/ROCm/ucc>`_,>=1.4.0,>=1.3.0,>=1.3.0
+      `UCX <https://github.com/ROCm/ucx>`_,>=1.17.0,>=1.15.0,>=1.15.0
      ,,,
      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix:,,
-      Thrust,2.6.0,2.6.0,2.5.0
-      CUB,2.6.0,2.6.0,2.5.0
+      Thrust,2.6.0,2.5.0,2.3.2
+      CUB,2.6.0,2.5.0,2.3.2
      ,,,
-      DRIVER & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
-      :doc:`AMD GPU Driver <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.10.2, 30.10.1 [#driver_patch]_, |br| 30.10, 6.4.x, 6.3.x","30.10.1 [#driver_patch]_, 30.10, |br| 6.4.x, 6.3.x, 6.2.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
+      KMD & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
+      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"30.10, 6.4.x, 6.3.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
      ,,,
      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
-      :doc:`MIGraphX <amdmigraphx:index>`,2.13.0,2.13.0,2.12.0
-      :doc:`MIOpen <miopen:index>`,3.5.0,3.5.0,3.4.0
-      :doc:`MIVisionX <mivisionx:index>`,3.3.0,3.3.0,3.2.0
-      :doc:`rocAL <rocal:index>`,2.3.0,2.3.0,2.2.0
-      :doc:`rocDecode <rocdecode:index>`,1.0.0,1.0.0,0.10.0
-      :doc:`rocJPEG <rocjpeg:index>`,1.1.0,1.1.0,0.8.0
-      :doc:`rocPyDecode <rocpydecode:index>`,0.6.0,0.6.0,0.3.1
-      :doc:`RPP <rpp:index>`,2.0.0,2.0.0,1.9.10
+      :doc:`MIGraphX <amdmigraphx:index>`,2.13.0,2.12.0,2.11.0
+      :doc:`MIOpen <miopen:index>`,3.5.0,3.4.0,3.3.0
+      :doc:`MIVisionX <mivisionx:index>`,3.3.0,3.2.0,3.1.0
+      :doc:`rocAL <rocal:index>`,2.3.0,2.2.0,2.1.0
+      :doc:`rocDecode <rocdecode:index>`,1.0.0,0.10.0,0.8.0
+      :doc:`rocJPEG <rocjpeg:index>`,1.1.0,0.8.0,0.6.0
+      :doc:`rocPyDecode <rocpydecode:index>`,0.6.0,0.3.1,0.2.0
+      :doc:`RPP <rpp:index>`,2.0.0,1.9.10,1.9.1
      ,,,
      COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
-      :doc:`RCCL <rccl:index>`,2.26.6,2.26.6,2.22.3
-      :doc:`rocSHMEM <rocshmem:index>`,3.0.0,3.0.0,2.0.0
+      :doc:`RCCL <rccl:index>`,2.26.6,2.22.3,2.21.5
+      :doc:`rocSHMEM <rocshmem:index>`,3.0.0,2.0.1,N/A
      ,,,
      MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
-      :doc:`hipBLAS <hipblas:index>`,3.0.2,3.0.0,2.4.0
-      :doc:`hipBLASLt <hipblaslt:index>`,1.0.0,1.0.0,0.12.0
-      :doc:`hipFFT <hipfft:index>`,1.0.20,1.0.20,1.0.18
-      :doc:`hipfort <hipfort:index>`,0.7.0,0.7.0,0.6.0
-      :doc:`hipRAND <hiprand:index>`,3.0.0,3.0.0,2.12.0
-      :doc:`hipSOLVER <hipsolver:index>`,3.0.0,3.0.0,2.4.0
-      :doc:`hipSPARSE <hipsparse:index>`,4.0.1,4.0.1,3.2.0
-      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.4,0.2.4,0.2.3
-      :doc:`rocALUTION <rocalution:index>`,4.0.0,4.0.0,3.2.2
-      :doc:`rocBLAS <rocblas:index>`,5.0.2,5.0.0,4.4.0
-      :doc:`rocFFT <rocfft:index>`,1.0.34,1.0.34,1.0.32
-      :doc:`rocRAND <rocrand:index>`,4.0.0,4.0.0,3.3.0
-      :doc:`rocSOLVER <rocsolver:index>`,3.30.1,3.30.0,3.28.0
-      :doc:`rocSPARSE <rocsparse:index>`,4.0.2,4.0.2,3.4.0
-      :doc:`rocWMMA <rocwmma:index>`,2.0.0,2.0.0,1.7.0
-      :doc:`Tensile <tensile:src/index>`,4.44.0,4.44.0,4.43.0
+      :doc:`hipBLAS <hipblas:index>`,3.0.0,2.4.0,2.3.0
+      :doc:`hipBLASLt <hipblaslt:index>`,1.0.0,0.12.1,0.10.0
+      :doc:`hipFFT <hipfft:index>`,1.0.20,1.0.18,1.0.17
+      :doc:`hipfort <hipfort:index>`,0.7.0,0.6.0,0.5.0
+      :doc:`hipRAND <hiprand:index>`,3.0.0,2.12.0,2.11.0
+      :doc:`hipSOLVER <hipsolver:index>`,3.0.0,2.4.0,2.3.0
+      :doc:`hipSPARSE <hipsparse:index>`,4.0.1,3.2.0,3.1.2
+      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.4,0.2.3,0.2.2
+      :doc:`rocALUTION <rocalution:index>`,4.0.0,3.2.3,3.2.1
+      :doc:`rocBLAS <rocblas:index>`,5.0.0,4.4.1,4.3.0
+      :doc:`rocFFT <rocfft:index>`,1.0.34,1.0.32,1.0.31
+      :doc:`rocRAND <rocrand:index>`,4.0.0,3.3.0,3.2.0
+      :doc:`rocSOLVER <rocsolver:index>`,3.30.0,3.28.2,3.27.0
+      :doc:`rocSPARSE <rocsparse:index>`,4.0.2,3.4.0,3.3.0
+      :doc:`rocWMMA <rocwmma:index>`,2.0.0,1.7.0,1.6.0
+      :doc:`Tensile <tensile:src/index>`,4.44.0,4.43.0,4.42.0
      ,,,
      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
-      :doc:`hipCUB <hipcub:index>`,4.0.0,4.0.0,3.4.0
-      :doc:`hipTensor <hiptensor:index>`,2.0.0,2.0.0,1.5.0
-      :doc:`rocPRIM <rocprim:index>`,4.0.1,4.0.0,3.4.0
-      :doc:`rocThrust <rocthrust:index>`,4.0.0,4.0.0,3.3.0
+      :doc:`hipCUB <hipcub:index>`,4.0.0,3.4.0,3.3.0
+      :doc:`hipTensor <hiptensor:index>`,2.0.0,1.5.0,1.4.0
+      :doc:`rocPRIM <rocprim:index>`,4.0.0,3.4.1,3.3.0
+      :doc:`rocThrust <rocthrust:index>`,4.0.0,3.3.0,3.3.0
      ,,,
      SUPPORT LIBS,,,
-      `hipother <https://github.com/ROCm/hipother>`_,7.0.51830,7.0.51830,6.4.43482
-      `rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.2,7.0.1/7.0.0,6.4.0
+      `hipother <https://github.com/ROCm/hipother>`_,7.0.51830,6.4.43483,6.3.42131
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,7.0.0,6.4.3,6.3.0
      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
      ,,,
      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
-      :doc:`AMD SMI <amdsmi:index>`,26.0.2,26.0.0,25.3.0
-      :doc:`ROCm Data Center Tool <rdc:index>`,1.1.0,1.1.0,0.3.0
+      :doc:`AMD SMI <amdsmi:index>`,26.0.0,25.5.1,24.7.1
+      :doc:`ROCm Data Center Tool <rdc:index>`,1.1.0,0.3.0,0.3.0
      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
-      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.8.0,7.5.0
-      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.2.0,1.1.0
-      :doc:`Cluster Validation Suite <cvs:index>`,1.0.0,1.0.0,N/A
+      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.8.0,7.7.0,7.4.0
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.2.0,1.1.0,1.1.0
      ,,,
      PERFORMANCE TOOLS,,,
-      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,2.6.0,1.4.0
-      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.2.3,3.2.3,3.1.0
-      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.1.1,1.1.0,1.0.0
-      :doc:`ROCProfiler <rocprofiler:index>`,2.0.70002,2.0.70000,2.0.60400
-      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,1.0.0,0.6.0
-      :doc:`ROCTracer <roctracer:index>`,4.1.70002,4.1.70000,4.1.60400
+      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,2.6.0,1.4.0,1.4.0
+      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.2.3,3.1.1,3.0.0
+      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.1.0,1.0.2,0.1.0
+      :doc:`ROCProfiler <rocprofiler:index>`,2.0.70000,2.0.60403,2.0.60300
+      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,1.0.0,0.6.0,0.5.0
+      :doc:`ROCTracer <roctracer:index>`,4.1.70000,4.1.60403,4.1.60300
      ,,,
      DEVELOPMENT TOOLS,,,
-      :doc:`HIPIFY <hipify:index>`,20.0.0,20.0.0,19.0.0
+      :doc:`HIPIFY <hipify:index>`,20.0.0,19.0.0,18.0.0.24455
      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0
-      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.4,0.77.3,0.77.2
-      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,16.3.0,15.2.0
-      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.5.0,0.4.0
-      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.1.0,2.0.4
+      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.3,0.77.2,0.77.0
+      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,16.3.0,15.2.0,15.2.0
+      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.5.0,0.4.0,0.4.0
+      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.1.0,2.0.4,2.0.3
      ,,,
      COMPILERS,.. _compilers-support-compatibility-matrix:,,
-      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A
      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
-      `Flang <https://github.com/ROCm/flang>`_,20.0.0.25385,20.0.0.25314,19.0.0.25133
-      :doc:`llvm-project <llvm-project:index>`,20.0.0.25385,20.0.0.25314,19.0.0.25133
-      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25385,20.0.0.25314,19.0.0.25133
+      `Flang <https://github.com/ROCm/flang>`_,20.0.0.25314,19.0.0.25224,18.0.0.24455
+      :doc:`llvm-project <llvm-project:index>`,20.0.0.25314,19.0.0.25224,18.0.0.24491
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,20.0.0.25314,19.0.0.25224,18.0.0.24491
      ,,,
      RUNTIMES,.. _runtime-support-compatibility-matrix:,,
-      :doc:`AMD CLR <hip:understand/amd_clr>`,7.0.51831,7.0.51830,6.4.43482
-      :doc:`HIP <hip:index>`,7.0.51831,7.0.51830,6.4.43482
+      :doc:`AMD CLR <hip:understand/amd_clr>`,7.0.51830,6.4.43484,6.3.42131
+      :doc:`HIP <hip:index>`,7.0.51830,6.4.43484,6.3.42131
      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
-      :doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.18.0,1.15.0
+      :doc:`ROCr Runtime <rocr-runtime:index>`,1.18.0,1.15.0,1.14.0

 .. rubric:: Footnotes

-.. [#rhel-10-702] RHEL 10.0 and RHEL 9.6 are supported on all listed :ref:`supported_GPUs` except AMD Radeon PRO V620 GPU.
-.. [#rhel-94-702] RHEL 9.4 is supported on all AMD Instinct GPUs listed under :ref:`supported_GPUs`.
-.. [#rhel-700] RHEL 8.10 is supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
-.. [#ol-700-mi300x] **For ROCm 7.0.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
+.. [#rhel-700] RHEL 8.10 is supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
+.. [#ol-700-mi300x] **For ROCm 7.0.0** - Oracle Linux 9 is supported only on AMD Instinct MI355X, MI350X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPUs.
 .. [#ol-mi300x] **Prior ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X GPUs.
-.. [#db-mi300x] **For ROCm 7.0.2** - Debian 13 is supported only on AMD Instinct MI300X GPUs.
-.. [#sles-db-700] **For ROCm 7.0.x** - SLES 15 SP7 and Debian 12 are supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
-.. [#az-mi300x] Starting ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710 GPUs.
-.. [#rl-700] Rocky Linux 9 is supported only on AMD Instinct MI300X and MI300A GPUs.
-.. [#single-node] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X GPUs for single-node functionality.
-.. [#mi350x-os] AMD Instinct MI355X (gfx950) and MI350X(gfx950) GPUs are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, Oracle Linux 10, and Oracle Linux 9.
-.. [#RDNA-OS-700] **For ROCm 7.0.x** - AMD Radeon PRO AI PRO R9700 (gfx1201), AMD Radeon RX 9070 XT (gfx1201), AMD Radeon RX 9070 GRE (gfx1201), AMD Radeon RX 9070 (gfx1201), AMD Radeon RX 9060 XT (gfx1200), AMD Radeon RX 9060 (gfx1200), AMD Radeon RX 7800 XT (gfx1101), AMD Radeon RX 7700 XT (gfx1101), AMD Radeon PRO W7700 (gfx1101), and AMD Radeon PRO W6800 (gfx1030) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, and RHEL 9.6.
-.. [#rd-v710] **For ROCm 7.0.x** - AMD Radeon PRO V710 (gfx1101) GPUs are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, and Azure Linux 3.0.
-.. [#rd-v620] **For ROCm 7.0.x** - AMD Radeon PRO V620 (gfx1030) GPUs are supported only on Ubuntu 24.04.3 and Ubuntu 22.04.5.
-.. [#mi325x-os] **For ROCm 7.0.x** - AMD Instinct MI325X GPUs (gfx942) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
-.. [#mi300x-os] **For ROCm 7.0.x** - AMD Instinct MI300X GPUs (gfx942) are supported on all listed :ref:`supported_distributions`.
-.. [#mi300A-os] **For ROCm 7.0.x** - AMD Instinct MI300A GPUs (gfx942) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, Debian 12, and Rocky Linux 9.
-.. [#mi200x-os] **For ROCm 7.0.x** - AMD Instinct MI200 Series GPUs (gfx90a) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, and Debian 12.
-.. [#mi100-os] **For ROCm 7.0.x** - AMD Instinct MI100 GPUs (gfx908) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, and RHEL 8.10.
-.. [#tf-mi350] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
-.. [#dgl_compat] DGL is supported only on ROCm 6.4.0.
-.. [#llama-cpp_compat] llama.cpp is supported only on ROCm 7.0.0 and ROCm 6.4.x.
-.. [#driver_patch] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
-.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
+.. [#sles-db-700] **For ROCm 7.0.0** - SLES 15 SP7 and Debian 12 are only supported on AMD Instinct MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
+.. [#az-mi300x] Starting ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710.
+.. [#rl-700] Rocky Linux 9 is only supported on AMD Instinct MI300X and MI300A GPUs.
+.. [#single-node] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
+.. [#mi350x-os] AMD Instinct MI355X (gfx950) and MI350X (gfx950) GPUs are only supported on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.4, and Oracle Linux 9.
+.. [#RDNA-OS-700] **For ROCm 7.0.0** - AMD Radeon PRO AI PRO R9700 (gfx1201), AMD Radeon RX 9070 XT (gfx1201), AMD Radeon RX 9070 GRE (gfx1201), AMD Radeon RX 9070 (gfx1201), AMD Radeon RX 9060 XT (gfx1200), AMD Radeon RX 7800 XT (gfx1101), AMD Radeon RX 7700 XT (gfx1101), AMD Radeon PRO W7700 (gfx1101), and AMD Radeon PRO W6800 (gfx1030) are only supported on Ubuntu 24.04.3, Ubuntu 22.04.5, and RHEL 9.6.
+.. [#RDNA-OS] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
+.. [#rd-v710] AMD Radeon PRO V710 (gfx1101) is only supported on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, and Azure Linux 3.0.
+.. [#rd-v620] AMD Radeon PRO V620 (gfx1030) is only supported on Ubuntu 24.04.3 and Ubuntu 22.04.5.
+.. [#mi325x-os] AMD Instinct MI325X GPU (gfx942) is only supported on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
+.. [#mi300x-os] AMD Instinct MI300X GPU (gfx942) is supported on all listed :ref:`supported_distributions`.
+.. [#mi300A-os] AMD Instinct MI300A GPU (gfx942) is supported only on Ubuntu 24.04, Ubuntu 22.04, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, Debian 12, and Rocky Linux 9.
+.. [#mi200x-os] AMD Instinct MI200 Series GPUs (gfx90a) are supported only on Ubuntu 24.04, Ubuntu 22.04, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, and Debian 12.
+.. [#mi100-os] AMD Instinct MI100 GPU (gfx908) is only supported on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.4, and RHEL 8.10.
+.. [#7700XT-OS] Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
+.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
 .. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.


@@ -201,8 +195,6 @@ Use this lookup table to confirm which operating system and kernel versions are
   ,,
   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 [GA], 6.8 [HWE]", 2.35
   ,,
-   `Red Hat Enterprise Linux (RHEL 10) <https://access.redhat.com/articles/3078#RHEL9>`_, 10.0, 6.12.0-55, 2.39
-   ,,
   `Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.6, 5.14.0-570, 2.34
   ,9.5, 5.14+, 2.34
   ,9.4, 5.14.0-427, 2.34
@@ -215,12 +207,10 @@ Use this lookup table to confirm which operating system and kernel versions are
   ,,
   `Rocky Linux <https://wiki.rockylinux.org/rocky/version/>`_, 9, 5.14.0-570, 2.34
   ,,
-   `Oracle Linux <https://blogs.oracle.com/scoter/post/oracle-linux-and-unbreakable-enterprise-kernel-uek-releases>`_, 10, 6.12.0 (UEK), 2.39
-   ,9, 6.12.0 (UEK), 2.34
+   `Oracle Linux <https://blogs.oracle.com/scoter/post/oracle-linux-and-unbreakable-enterprise-kernel-uek-releases>`_, 9, 6.12.0 (UEK), 2.34
   ,8, 5.15.0 (UEK), 2.28
   ,,
-   `Debian <https://www.debian.org/download>`_,13, 6.12, 2.35
-   ,12, 6.1.0, 2.36
+   `Debian <https://www.debian.org/download>`_,12, 6.1.0, 2.36
   ,,
   `Azure Linux <https://techcommunity.microsoft.com/blog/linuxandopensourceblog/azure-linux-3-0-now-in-preview-on-azure-kubernetes-service-v1-31/4287229>`_,3.0, 6.6.92, 2.38
   ,,
@@ -255,47 +245,29 @@ Expand for full historical view of:

   .. rubric:: Footnotes

-   .. [#rhel-10-702-past-60] RHEL 10.0 and RHEL 9.6 are supported on all listed :ref:`supported_GPUs` except AMD Radeon PRO V620 GPU.
-   .. [#rhel-94-702-past-60] RHEL 9.4 is supported on all AMD Instinct GPUs listed under :ref:`supported_GPUs`.
-   .. [#rhel-700-past-60] **For ROCm 7.0.x** - RHEL 8.10 is supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, MI210, and MI100 GPUs.
-   .. [#ol-700-mi300x-past-60] **For ROCm 7.0.x** - Oracle Linux 10 and 9 are supported only on AMD Instinct MI355X, MI350X, and MI300X GPUs. Oracle Linux 8 is supported only on AMD Instinct MI300X GPU.
-   .. [#mi300x-past-60] **Prior ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X GPUs.
-   .. [#db-mi300x-past-60] **For ROCm 7.0.2** - Debian 13 is supported only on AMD Instinct MI300X GPUs.
-   .. [#sles-db-700-past-60] **For ROCm 7.0.x** - SLES 15 SP7 and Debian 12 are supported only on AMD Instinct MI300X, MI300A, MI250X, MI250, and MI210 GPUs.
-   .. [#single-node-past-60] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X GPUs for single-node functionality.
-   .. [#az-mi300x-past-60] Starting from ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710 GPUs.
-   .. [#az-mi300x-630-past-60] **Prior ROCm 6.4.0**- Azure Linux 3.0 is supported only on AMD Instinct MI300X GPUs.
-   .. [#rl-700-past-60] Rocky Linux 9 is supported only on AMD Instinct MI300X and MI300A GPUs.
-   .. [#mi350x-os-past-60] AMD Instinct MI355X (gfx950) and MI350X(gfx950) GPUs are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.4, and Oracle Linux 9.
-   .. [#RDNA-OS-700-past-60] **For ROCm 7.0.x** AMD Radeon PRO AI PRO R9700 (gfx1201), AMD Radeon RX 9070 XT (gfx1201), AMD Radeon RX 9070 GRE (gfx1201), AMD Radeon RX 9070 (gfx1201), AMD Radeon RX 9060 XT (gfx1200), AMD Radeon RX 9060 (gfx1200), AMD Radeon RX 7800 XT (gfx1101), AMD Radeon RX 7700 XT (gfx1101), AMD Radeon PRO W7700 (gfx1101), and AMD Radeon PRO W6800 (gfx1030) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, Oracle Linux 10, and Oracle Linux 9.
-   .. [#RDNA-OS-past-60] **Prior ROCm 7.0.0** - Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
-   .. [#rd-v710-past-60] **For ROCm 7.0.x** - AMD Radeon PRO V710 (gfx1101) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, and Azure Linux 3.0.
-   .. [#rd-v620-past-60] **For ROCm 7.0.x** - AMD Radeon PRO V620 (gfx1030) is supported only on Ubuntu 24.04.3 and Ubuntu 22.04.5.
-   .. [#mi325x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI325X GPU (gfx942) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
-   .. [#mi300x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI300X GPU (gfx942) is supported on all listed :ref:`supported_distributions`.
-   .. [#mi300A-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI300A GPU (gfx942) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, Debian 12, and Rocky Linux 9.
-   .. [#mi200x-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI200 Series GPUs (gfx90a) are supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, RHEL 8.10, SLES 15 SP7, and Debian 12.
-   .. [#mi100-os-past-60] **For ROCm 7.0.x** - AMD Instinct MI100 GPU (gfx908) is supported only on Ubuntu 24.04.3, Ubuntu 22.04.5, RHEL 10.0, RHEL 9.6, RHEL 9.4, and RHEL 8.10.
-   .. [#7700XT-OS-past-60] **Prior to ROCm 7.0.0** - Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
+   .. [#ol-700-mi300x-past-60] **For ROCm 7.0.0** - Oracle Linux 9 is supported only on AMD Instinct MI300X, MI350X, and MI355X. Oracle Linux 8 is only supported on AMD Instinct MI300X.
+   .. [#mi300x-past-60] **Prior to ROCm 7.0.0** - Oracle Linux is supported only on AMD Instinct MI300X.
+   .. [#single-node-past-60] **Prior to ROCm 7.0.0** - Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
+   .. [#az-mi300x-past-60] Starting from ROCm 6.4.0, Azure Linux 3.0 is supported only on AMD Instinct MI300X and AMD Radeon PRO V710.
+   .. [#az-mi300x-630-past-60] **Prior to ROCm 6.4.0** - Azure Linux 3.0 is supported only on AMD Instinct MI300X.
+   .. [#RDNA-OS-past-60] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
+   .. [#7700XT-OS-past-60] Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
   .. [#mi300_624-past-60] **For ROCm 6.2.4** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
   .. [#mi300_622-past-60] **For ROCm 6.2.2** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
   .. [#mi300_621-past-60] **For ROCm 6.2.1** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
   .. [#mi300_620-past-60] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
-   .. [#mi300_612-past-60] **For ROCm 6.1.2** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4 and Oracle Linux.
-   .. [#mi300_611-past-60] **For ROCm 6.1.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4 and Oracle Linux.
-   .. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.4.
-   .. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.3.
-   .. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is supported only on Ubuntu 22.04.3.
-   .. [#tf-mi350-past-60] TensorFlow 2.17.1 is not supported on AMD Instinct MI350 Series GPUs. Use TensorFlow 2.19.1 or 2.18.1 with MI350 Series GPUs instead.
-   .. [#verl_compat-past-60] verl is supported only on ROCm 6.2.0.
-   .. [#stanford-megatron-lm_compat-past-60] Stanford Megatron-LM is supported only on ROCm 6.3.0.
-   .. [#dgl_compat-past-60] DGL is supported only on ROCm 6.4.0.
-   .. [#megablocks_compat-past-60] Megablocks is supported only on ROCm 6.3.0.
-   .. [#taichi_compat-past-60] Taichi is supported only on ROCm 6.3.2.
-   .. [#ray_compat-past-60] Ray is supported only on ROCm 6.4.1.
-   .. [#llama-cpp_compat-past-60] llama.cpp is supported only on ROCm 7.0.0 and 6.4.x.
-   .. [#flashinfer_compat-past-60] FlashInfer is supported only on ROCm 6.4.1.
-   .. [#driver_patch-past-60] AMD GPU Driver (amdgpu) 30.10.1 is a quality release that resolves an issue identified in the 30.10 release. There are no other significant changes or feature additions in ROCm 7.0.1 from ROCm 7.0.0. AMD GPU Driver (amdgpu) 30.10.1 is compatible with ROCm 7.0.1 and ROCm 7.0.0.
-   .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD GPU Driver (amdgpu) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and AMD GPU Driver support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
+   .. [#mi300_612-past-60] **For ROCm 6.1.2** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4 and Oracle Linux.
+   .. [#mi300_611-past-60] **For ROCm 6.1.1** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4 and Oracle Linux.
+   .. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
+   .. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
+   .. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
+   .. [#verl_compat] verl is only supported on ROCm 6.2.0.
+   .. [#stanford-megatron-lm_compat] Stanford Megatron-LM is only supported on ROCm 6.3.0.
+   .. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
+   .. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
+   .. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
+   .. [#ray_compat] Ray is only supported on ROCm 6.4.1.
+   .. [#llama-cpp_compat] llama.cpp is only supported on ROCm 6.4.0.
+   .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
   .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
   
--- a/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst
@@ -1,107 +0,0 @@
-:orphan:
-
-.. meta::
-    :description: FlashInfer deep learning framework compatibility
-    :keywords: GPU, LLM, FlashInfer, compatibility
-
-.. version-set:: rocm_version latest
-
-********************************************************************************
-FlashInfer compatibility
-********************************************************************************
-
-`FlashInfer <https://docs.flashinfer.ai/index.html>`__ is a library and kernel generator 
-for Large Language Models (LLMs) that provides high-performance implementation of graphics 
-processing units (GPUs) kernels. FlashInfer focuses on LLM serving and inference, as well 
-as advanced performance across diverse scenarios.
-
-FlashInfer features highly efficient attention kernels, load-balanced scheduling, and memory-optimized 
-techniques, while supporting customized attention variants. It’s compatible with ``torch.compile``, and 
-offers high-performance LLM-specific operators, with easy integration through PyTorch, and C++ APIs.
-
-.. note::
-
-  The ROCm port of FlashInfer is under active development, and some features are not yet available. 
-  For the latest feature compatibility matrix, refer to the ``README`` of the 
-  `https://github.com/ROCm/flashinfer <https://github.com/ROCm/flashinfer>`__ repository.
-
-Support for the ROCm port of FlashInfer is available as follows:
-
- ROCm support for FlashInfer is hosted in the `https://github.com/ROCm/flashinfer 
-  <https://github.com/ROCm/flashinfer>`__ repository. This location differs from the 
-  `https://github.com/flashinfer-ai/flashinfer <https://github.com/flashinfer-ai/flashinfer>`_ 
-  upstream repository.
-
- To install FlashInfer, use the prebuilt :ref:`Docker image <flashinfer-docker-compat>`, 
-  which includes ROCm, FlashInfer, and all required dependencies.
-
-  - See the :doc:`ROCm FlashInfer installation guide <rocm-install-on-linux:install/3rd-party/flashinfer-install>` 
-    to install and get started.
-
-  - See the `Installation guide <https://docs.flashinfer.ai/installation.html>`__ 
-    in the upstream FlashInfer documentation.
-
-.. note::
-
-  Flashinfer is supported on ROCm 6.4.1.
-
-Supported devices
-================================================================================
-
-**Officially Supported**: AMD Instinct™ MI300X
-
-
-.. _flashinfer-recommendations:
-
-Use cases and recommendations
-================================================================================
-
-This release of FlashInfer on ROCm provides the decode functionality for LLM inferencing.
-In the decode phase, tokens are generated sequentially, with the model predicting each new 
-token based on the previously generated tokens and the input context.
-
-FlashInfer on ROCm brings over upstream features such as load balancing, sparse and dense 
-attention optimizations, and batching support, enabling efficient execution on AMD Instinct™ MI300X GPUs.
-
-Because large LLMs often require substantial KV caches or long context windows, FlashInfer on ROCm 
-also implements cascade attention from upstream to reduce memory usage. 
-
-For currently supported use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
-where you can search for examples and best practices to optimize your workloads on AMD GPUs.
-
-.. _flashinfer-docker-compat:
-
-Docker image compatibility
-================================================================================
-
-.. |docker-icon| raw:: html
-
-   <i class="fab fa-docker"></i>
-
-AMD validates and publishes `ROCm FlashInfer images <https://hub.docker.com/r/rocm/flashinfer/tags>`__
-with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
-inventories represent the FlashInfer version from the official Docker Hub.
-The Docker images have been validated for `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
-Click |docker-icon| to view the image on Docker Hub.
-
-.. list-table:: 
-    :header-rows: 1
-    :class: docker-image-compatibility
-
-    * - Docker image
-      - ROCm
-      - FlashInfer
-      - PyTorch
-      - Ubuntu
-      - Python
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/flashinfer/flashinfer-0.2.5_rocm6.4_ubuntu24.04_py3.12_pytorch2.7/images/sha256-558914838821c88c557fb6d42cfbc1bdb67d79d19759f37c764a9ee801f93313"><i class="fab fa-docker fa-lg"></i> rocm/flashinfer</a>
-      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
-      - `v0.2.5 <https://github.com/flashinfer-ai/flashinfer/releases/tag/v0.2.5>`__
-      - `2.7.1 <https://github.com/ROCm/pytorch/releases/tag/v2.7.1>`__
-      - 24.04
-      - `3.12 <https://www.python.org/downloads/release/python-3129/>`__
-
-
--- a/docs/compatibility/ml-compatibility/jax-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/jax-compatibility.rst
@@ -79,7 +79,7 @@ Use cases and recommendations

 * The `MI300X workload optimization guide <https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html>`_
  provides detailed guidance on optimizing workloads for the AMD Instinct MI300X
-  GPU using ROCm. The page is aimed at helping users achieve optimal
+  accelerator using ROCm. The page is aimed at helping users achieve optimal
  performance for deep learning and other high-performance computing tasks on
  the MI300X GPU.

@@ -90,15 +90,75 @@ For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.b
 Docker image compatibility
 ================================================================================

-AMD provides preconfigured Docker images with JAX and the ROCm backend.
-These images are published on `Docker Hub <https://hub.docker.com/r/rocm/jax>`__ and are the
-recommended way to get started with deep learning with JAX on ROCm.
-For ``jax-community`` images, see `rocm/jax-community
-<https://hub.docker.com/r/rocm/jax-community/tags>`__ on Docker Hub.
+.. |docker-icon| raw:: html

-To find the right image tag, see the :ref:`JAX on ROCm installation
-documentation <rocm-install-on-linux:jax-docker-support>` for a list of
-available ``rocm/jax`` images.
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes ready-made `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
+with ROCm backends on Docker Hub. The following Docker image tags and
+associated inventories represent the latest JAX version from the official Docker Hub and are validated for
+`ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`_. Click the |docker-icon|
+icon to view the image on Docker Hub.
+
+.. list-table:: JAX Docker image components
+    :header-rows: 1
+
+    * - Docker image
+      - JAX
+      - Linux
+      - Python
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.2-jax0.4.35-py3.12/images/sha256-8918fa806a172c1a10eb2f57131eb31b5d7c8fa1656b8729fe7d3d736112de83"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
+
+      - `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
+      - Ubuntu 24.04
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.2-jax0.4.35-py3.10/images/sha256-a394be13c67b7fc602216abee51233afd4b6cb7adaa57ca97e688fba82f9ad79"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
+
+      - `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
+      - Ubuntu 22.04
+      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
+
+AMD publishes `Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
+with ROCm backends on Docker Hub. The following Docker image tags and
+associated inventories are tested for `ROCm 6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`_.
+
+.. list-table:: JAX community Docker image components
+    :header-rows: 1
+
+    * - Docker image
+      - JAX
+      - Linux
+      - Python
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.12.8/images/sha256-25dfaa0183e274bd0a3554a309af3249c6f16a1793226cb5373f418e39d3146a"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
+
+      - `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
+      - Ubuntu 22.04
+      - `3.12.8 <https://www.python.org/downloads/release/python-3128/>`_
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.11.11/images/sha256-ff9baeca9067d13e6c279c911e5a9e5beed0817d24fafd424367cc3d5bd381d7"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
+
+      - `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
+      - Ubuntu 22.04
+      - `3.11.11 <https://www.python.org/downloads/release/python-31111/>`_
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/jax-community/rocm6.3.2-jax0.5.0-py3.10.16/images/sha256-8bab484be1713655f74da51a191ed824bb9d03db1104fd63530a1ac3c37cf7b1"><i class="fab fa-docker fa-lg"></i> rocm/jax-community</a>
+
+      - `0.5.0 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.5.0>`_
+      - Ubuntu 22.04
+      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_

 .. _key_rocm_libraries:

--- a/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
@@ -16,7 +16,7 @@ for Large Language Model (LLM) inference that runs on both central processing un
 a simple, dependency-free setup. 

 The framework supports multiple quantization options, from 1.5-bit to 8-bit integers, 
-to accelerate inference and reduce memory usage. Originally built as a CPU-first library, 
+to speed up inference and reduce memory usage. Originally built as a CPU-first library, 
 llama.cpp is easy to integrate with other programming environments and is widely 
 adopted across diverse platforms, including consumer devices. 

@@ -40,12 +40,12 @@ with ROCm support:

 .. note::

-  llama.cpp is supported on ROCm 7.0.0 and ROCm 6.4.x.
+  llama.cpp is supported on ROCm 6.4.0.

 Supported devices
 ================================================================================

-**Officially Supported**: AMD Instinct™ MI300X, MI325X, MI210
+**Officially Supported**: AMD Instinct™ MI300X, MI210


 Use cases and recommendations
@@ -70,7 +70,7 @@ llama.cpp is also used in a range of real-world applications, including:
 For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
 where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.

- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__ 
+- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__
  blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``, 
  server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for 
  AMD Instinct GPUs within the ROCm ecosystem. 
@@ -84,9 +84,9 @@ Docker image compatibility

   <i class="fab fa-docker"></i>

-AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp/tags>`__
+AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp>`__
 with ROCm backends on Docker Hub. The following Docker image tags and associated
-inventories represent the available llama.cpp versions from the official Docker Hub.
+inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
 Click |docker-icon| to view the image on Docker Hub.

 .. important::
@@ -105,115 +105,8 @@ Click |docker-icon| to view the image on Docker Hub.
      - Server Docker
      - Light Docker
      - llama.cpp
-      - ROCm
      - Ubuntu

-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_full/images/sha256-a2ecd635eaa65bb289a9041330128677f3ae88bee6fee0597424b17e38d4903c"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_server/images/sha256-cb46b47df415addb5ceb6e6fdf0be70bf9d7f6863bbe6e10c2441ecb84246d52"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu24.04_light/images/sha256-8f8536eec4b05c0ff1c022f9fc6c527ad1c89e6c1ca0906e4d39e4de73edbde9"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
-      - 24.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_full/images/sha256-f36de2a3b03ae53e81c85422cb3780368c9891e1ac7884b04403a921fe2ea45d"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_server/images/sha256-df15e8ab11a6837cd3736644fec1e047465d49e37d610ab0b79df000371327df"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm7.0.0_ubuntu22.04_light/images/sha256-4ea2d5bb7964f0ee3ea9b30ba7f343edd6ddfab1b1037669ca7eafad2e3c2bd7"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__
-      - 22.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu24.04_full/images/sha256-5960fc850024a8a76451f9eaadd89b7e59981ae9f393b407310c1ddf18892577"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu24.04_server/images/sha256-1b79775d9f546065a6aaf9ca426e1dd4ed4de0b8f6ee83687758cc05af6538e6"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu24.04_light/images/sha256-8f863c4c2857ae42bebd64e4f1a0a1e7cc3ec4503f243e32b4a4dcad070ec361"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
-      - 24.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu22.04_full/images/sha256-888879b3ee208f9247076d7984524b8d1701ac72611689e89854a1588bec9867"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu22.04_server/images/sha256-90e4ff99a66743e33fd00728cd71a768588e5f5ef355aaa196669fe65ac70672"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.3_ubuntu22.04_light/images/sha256-bd447a049939cb99054f8fbf3f2352870fe906a75e2dc3339c845c08b9c53f9b"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.3 <https://repo.radeon.com/rocm/apt/6.4.3/>`__
-      - 22.04
-
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu24.04_full/images/sha256-5b3a1bc4889c1fcade434b937fbf9cc1c22ff7dc0317c130339b0c9238bc88c4"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu24.04_server/images/sha256-5228ff99d0f627a9032d668f4381b2e80dc1e301adc3e0821f26d8354b175271"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu24.04_light/images/sha256-b12723b332a826a89b7252dddf868cbe4d1a869562fc4aa4032f59e1a683b968"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__
-      - 24.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu22.04_full/images/sha256-cd6e21a6a73f59b35dd5309b09dd77654a94d783bf13a55c14eb8dbf8e9c2615"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu22.04_server/images/sha256-c2b4689ab2c47e6626e8fea22d7a63eb03d47c0fde9f5ef8c9f158d15c423e58"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.2_ubuntu22.04_light/images/sha256-1acc28f29ed87db9cbda629cb29e1989b8219884afe05f9105522be929e94da4"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__
-      - 22.04
-
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu24.04_full/images/sha256-2f8ae8a44510d96d52dea6cb398b224f7edeb7802df7ec488c6f63d206b3cdc9"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu24.04_server/images/sha256-fece497ff9f4a28b12f645de52766941da8ead8471aa1ea84b61d4b4568e51f2"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu24.04_light/images/sha256-3e14352fa6f8c6128b23cf9342531c20dbfb522550b626e09d83b260a1947022"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
-      - 24.04
-
-    * - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu22.04_full/images/sha256-80763062ef0bec15038c35fd01267f1fc99a5dd171d4b48583cc668b15efad69"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu22.04_server/images/sha256-db2a6c957555ed83b819bbc54aea884a93192da0fb512dae63d32e0dc4e8ab8f"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - .. raw:: html
-
-           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b6356_rocm6.4.1_ubuntu22.04_light/images/sha256-c6dbb07cc655fb079d5216e4b77451cb64a9daa0585d23b6fb8b32cb22021197"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
-      - `b6356 <https://github.com/ROCm/llama.cpp/tree/release/b6356>`__
-      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
-      - 22.04
-
    * - .. raw:: html

           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_full/images/sha256-f78f6c81ab2f8e957469415fe2370a1334fe969c381d1fe46050c85effaee9d5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
@@ -224,52 +117,40 @@ Click |docker-icon| to view the image on Docker Hub.

           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_light/images/sha256-cc324e6faeedf0e400011f07b49d2dc41a16bae257b2b7befa0f4e2e97231320"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
      - `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
-      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
      - 24.04

-
 Key ROCm libraries for llama.cpp
 ================================================================================

 llama.cpp functionality on ROCm is determined by its underlying library
 dependencies. These ROCm components affect the capabilities, performance, and
-feature set available to developers. Ensure you have the required libraries for 
-your corresponding ROCm version.
+feature set available to developers.

 .. list-table::
    :header-rows: 1

    * - ROCm library
-      - ROCm 7.0.0 version
-      - ROCm 6.4.x version
+      - Version
      - Purpose
      - Usage
    * - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
-      - 3.0.0
-      - 2.4.0
+      - :version-ref:`hipBLAS rocm_version`
      - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
        matrix and vector operations.
      - Supports operations such as matrix multiplication, matrix-vector
        products, and tensor contractions. Utilized in both dense and batched
        linear algebra operations.
    * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
-      - 1.0.0
-      - 0.12.0
+      - :version-ref:`hipBLASLt rocm_version`
      - hipBLASLt is an extension of the hipBLAS library, providing additional
        features like epilogues fused into the matrix multiplication kernel or
        use of integer tensor cores.
      - By setting the flag ``ROCBLAS_USE_HIPBLASLT``, you can dispatch hipBLASLt
        kernels where possible.
    * - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
-      - 2.0.0
-      - 1.7.0
+      - :version-ref:`rocWMMA rocm_version`
      - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
        multiplication (GEMM) and accumulation operations with mixed precision
        support.
      - Can be used to enhance the flash attention performance on AMD compute, by enabling
-        the flag during compile time.
-
-Previous versions
-===============================================================================
-See :doc:`rocm-install-on-linux:install/3rd-party/previous-versions/llama-cpp-history` to find documentation for previous releases
-of the ``ROCm/llama.cpp`` Docker image.
+        the flag during compile time.
--- a/docs/compatibility/ml-compatibility/megablocks-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/megablocks-compatibility.rst
@@ -28,7 +28,7 @@ Supported devices
 ================================================================================

 - **Officially Supported**: AMD Instinct MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
+- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210

 Supported models and features
 ================================================================================
--- a/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
@@ -73,9 +73,9 @@ Use cases and recommendations

 * The :doc:`Instinct MI300X workload optimization guide </how-to/rocm-for-ai/inference-optimization/workload>`
  provides detailed guidance on optimizing workloads for the AMD Instinct MI300X
-  GPU using ROCm. This guide helps users achieve optimal performance for
+  accelerator using ROCm. This guide helps users achieve optimal performance for
  deep learning and other high-performance computing tasks on the MI300X
-  GPU.
+  accelerator.

 * The :doc:`Inception with PyTorch documentation </conceptual/ai-pytorch-inception>`
  describes how PyTorch integrates with ROCm for AI workloads. It outlines the
@@ -89,13 +89,141 @@ For more use cases and recommendations, see `ROCm PyTorch blog posts <https://ro
 Docker image compatibility
 ================================================================================

-AMD provides preconfigured Docker images with PyTorch and the ROCm backend.
-These images are published on `Docker Hub <https://hub.docker.com/r/rocm/pytorch>`__ and are the
-recommended way to get started with deep learning with PyTorch on ROCm.
+.. |docker-icon| raw:: html

-To find the right image tag, see the :ref:`PyTorch on ROCm installation
-documentation <rocm-install-on-linux:pytorch-docker-support>` for a list of
-available ``rocm/pytorch`` images.
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch>`__
+with ROCm backends on Docker Hub. The following Docker image tags and associated
+inventories were tested on `ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__.
+Click |docker-icon| to view the image on Docker Hub.
+
+.. list-table:: PyTorch Docker image components
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Docker
+      - PyTorch
+      - Ubuntu
+      - Python
+      - Apex
+      - torchvision
+      - TensorBoard
+      - MAGMA
+      - UCX
+      - OMPI
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-6a287591500b4048a9556c1ecc92bc411fd3d552f6c8233bc399f18eb803e8d6"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
+      - 24.04
+      - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`__
+      - `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.6.0/images/sha256-06b967629ba6657709f04169832cd769a11e6b491e8b1394c361d42d7a0c8b43"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
+      - 22.04
+      - `3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`__
+      - `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.5.1/images/sha256-62022414217ef6de33ac5b1341e57db8a48e8573fa2ace12d48aa5edd4b99ef0"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
+      - 24.04
+      - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
+      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.11_pytorch_release_2.5.1/images/sha256-469a7f74fc149aff31797e011ee41978f6a190adc69fa423b3c6a718a77bd985"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
+      - 22.04
+      - `3.11 <https://www.python.org/downloads/release/python-31113/>`__
+      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
+      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.5.1/images/sha256-37f41a1cd94019688669a1b20d33ea74156e0c129ef6b8270076ef214a6a1a2c"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`__
+      - 22.04
+      - `3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`__
+      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-60824ba83dc1b9d94164925af1f81c0235c105dd555091ec04c57e05177ead1b"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
+      - 24.04
+      - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`__
+      - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-fe944fe083312f901be6891ab4d3ffebf2eaf2cf4f5f0f435ef0b76ec714fabd"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
+      - 22.04
+      - `3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`__
+      - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`__
+      - `2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`__
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.2_ubuntu24.04_py3.12_pytorch_release_2.3.0/images/sha256-1d59251c47170c5b8960d1172a4dbe52f5793d8966edd778f168eaf32d56661a"><i class="fab fa-docker fa-lg"></i></a>
+
+      - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`__
+      - 24.04
+      - `3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`__
+      - `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`__
+      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`__
+      - `master <https://bitbucket.org/icl/magma/src/master/>`__
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`__
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`__

 Key ROCm libraries for PyTorch
 ================================================================================
@@ -338,7 +466,7 @@ with ROCm.
    * - Library
      - Description

-    * - `torchaudio <https://docs.pytorch.org/audio/stable/index.html>`_
+    * - `torchaudio <https://docs.pytorch.org/audio/stable/index.html>`_ 
      - Audio and signal processing library for PyTorch. Provides utilities for
        audio I/O, signal and data processing functions, datasets, model
        implementations, and application components for audio and speech
@@ -365,11 +493,11 @@ with ROCm.
        and popular datasets for natural language processing, including
        tokenization, vocabulary management, and text embeddings.

-        **Note:** ``torchtext`` does not implement ROCm-specific kernels.
+        **Note:** ``torchtext`` does not implement ROCm-specific kernels. 
        ROCm acceleration is provided through the underlying PyTorch framework
        and ROCm library integration. Only the official release exists.

-    * - `torchdata <https://meta-pytorch.org/data/beta/index.html#torchdata>`_
+    * - `torchdata <https://docs.pytorch.org/data/beta/index.html>`_
      - Beta library of common modular data loading primitives for easily
        constructing flexible and performant data pipelines, with features still
        in prototype stage.
@@ -417,7 +545,7 @@ Key features and enhancements for PyTorch 2.7 with ROCm 7.0

 - Expanded GPU architecture support: Provides optimized support for newer GPU
  architectures, including gfx1200 and gfx1201 with preferred hipBLASLt backend
-  selection, along with improvements for gfx950 and gfx1100 Series GPUs.
+  selection, along with improvements for gfx950 and gfx1100 series GPUs.

 - Advanced Triton Integration: AOTriton 0.10b introduces official support for
  gfx950 and gfx1201, along with experimental support for gfx1101, gfx1151,
@@ -471,7 +599,7 @@ Known issues and notes for PyTorch 2.7 with ROCm 7.0
 ================================================================================

 - The ``matmul.allow_fp16_reduced_precision_reduction`` and
-  ``matmul.allow_bf16_reduced_precision_reduction`` options under
-  ``torch.backends.cuda`` are not supported. As a result,
+  ``matmul.allow_bf16_reduced_precision_reduction`` options under 
+  ``torch.backends.cuda`` are not supported. As a result, 
  reduced-precision reductions using FP16 or BF16 accumulation types are not
  available.
--- a/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
@@ -27,7 +27,7 @@ Supported Devices
 ================================================================================

 - **Officially Supported**: AMD Instinct MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
+- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210


 Supported models and features
--- a/docs/compatibility/ml-compatibility/taichi-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/taichi-compatibility.rst
@@ -30,8 +30,8 @@ visual effects in film and gaming, and general-purpose computing.

 Supported devices and features
 ===============================================================================
-There is support through the ROCm software stack for all Taichi GPU features on AMD Instinct MI250X and MI210X Series GPUs with the exception of Taichi’s GPU rendering system, CGUI.
-AMD Instinct MI300X Series GPUs will be supported by November.
+There is support through the ROCm software stack for all Taichi GPU features on AMD Instinct MI250X and MI210 series GPUs with the exception of Taichi’s GPU rendering system, CGUI.
+AMD Instinct MI300X series GPUs will be supported by November.

 .. _taichi-recommendations:

--- a/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
@@ -47,15 +47,80 @@ fixes, updates, and support for the latest ROCM versions.
 .. _tensorflow-docker-compat:

 Docker image compatibility
-================================================================================
+===============================================================================

-AMD provides preconfigured Docker images with TensorFlow and the ROCm backend.
-These images are published on `Docker Hub <https://hub.docker.com/r/rocm/tensorflow>`__ and are the
-recommended way to get started with deep learning with TensorFlow on ROCm.
+.. |docker-icon| raw:: html

-To find the right image tag, see the :ref:`TensorFlow on ROCm installation
-documentation <rocm-install-on-linux:tensorflow-docker-support>` for a list of
-available ``rocm/tensorflow`` images.
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes ready-made `TensorFlow images
+<https://hub.docker.com/r/rocm/tensorflow>`__ with ROCm backends on
+Docker Hub. The following Docker image tags and associated inventories are
+validated for `ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__. Click
+the |docker-icon| icon to view the image on Docker Hub.
+
+.. list-table:: TensorFlow Docker image components
+    :header-rows: 1
+
+    * - Docker image
+      - TensorFlow
+      - Ubuntu
+      - Python
+      - TensorBoard
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.18-dev/images/sha256-96754ce2d30f729e19b497279915b5212ba33d5e408e7e5dd3f2304d87e3441e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - 24.04
+      - `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.18-dev/images/sha256-fa741508d383858e86985a9efac85174529127408102558ae2e3a4ac894eea1e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - 22.04
+      - `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.17-dev/images/sha256-3a0aef09f2a8833c2b64b85874dd9449ffc2ad257351857338ff5b706c03a418"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - 24.04
+      - `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.17-dev/images/sha256-bc7341a41ebe7ab261aa100732874507c452421ef733e408ac4f05ed453b0bc5"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - 22.04
+      - `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.16-dev/images/sha256-4841a8df7c340dab79bf9362dad687797649a00d594e0832eb83ea6880a40d3b"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - 24.04
+      - `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
+      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.16-dev/images/sha256-883fa95aba960c58a3e46fceaa18f03ede2c7df89b8e9fd603ab2d47e0852897"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+
+      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - 22.04
+      - `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
+      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__


 Critical ROCm libraries for TensorFlow
--- a/docs/conceptual/gpu-arch.md
+++ b/docs/conceptual/gpu-arch.md
@@ -13,22 +13,22 @@
 :gutter: 1

 :::{grid-item-card}
-**AMD Instinct MI300 Series**
+**AMD Instinct MI300 series**

-Review hardware aspects of the AMD Instinct™ MI300 Series GPUs and the CDNA™ 3
+Review hardware aspects of the AMD Instinct™ MI300 series of GPU accelerators and the CDNA™ 3
 architecture.

 * [AMD Instinct™ MI300 microarchitecture](./gpu-arch/mi300.md)
 * [AMD Instinct MI300/CDNA3 ISA](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf)
 * [White paper](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf)
 * [MI300 performance counters](./gpu-arch/mi300-mi200-performance-counters.rst)
-* [MI350 Series performance counters](./gpu-arch/mi350-performance-counters.rst)
+* [MI350 series performance counters](./gpu-arch/mi350-performance-counters.rst)
 :::

 :::{grid-item-card}
-**AMD Instinct MI200 Series**
+**AMD Instinct MI200 series**

-Review hardware aspects of the AMD Instinct™ MI200 Series GPUs and the CDNA™ 2
+Review hardware aspects of the AMD Instinct™ MI200 series of GPU accelerators and the CDNA™ 2
 architecture.

 * [AMD Instinct™ MI250 microarchitecture](./gpu-arch/mi250.md)
@@ -41,7 +41,7 @@ architecture.
 :::{grid-item-card}
 **AMD Instinct MI100**

-Review hardware aspects of the AMD Instinct™ MI100 Series GPUs and the CDNA™ 1
+Review hardware aspects of the AMD Instinct™ MI100 series of GPU accelerators and the CDNA™ 1
 architecture.

 * [AMD Instinct™ MI100 microarchitecture](./gpu-arch/mi100.md)
--- a/docs/conceptual/gpu-arch/mi100.md
+++ b/docs/conceptual/gpu-arch/mi100.md
@@ -1,14 +1,14 @@
 ---
 myst:
  html_meta:
-    "description lang=en": "Learn about the AMD Instinct MI100 Series architecture."
+    "description lang=en": "Learn about the AMD Instinct MI100 series architecture."
    "keywords": "Instinct, MI100, microarchitecture, AMD, ROCm"
 ---

 # AMD Instinct™ MI100 microarchitecture

 The following image shows the node-level architecture of a system that
-comprises two AMD EPYC™ processors and (up to) eight AMD Instinct™ GPUs.
+comprises two AMD EPYC™ processors and (up to) eight AMD Instinct™ accelerators.
 The two EPYC processors are connected to each other with the AMD Infinity™
 fabric which provides high-bandwidth (up to 18 GT/sec) and coherent links such
 that each processor can access the available node memory as a single
@@ -18,29 +18,29 @@ available to connect the processors plus one PCIe Gen 4 x16 link per processor
 can attach additional I/O devices such as the host adapters for the network
 fabric.

-![Structure of a single GCD in the AMD Instinct MI100 GPU](../../data/conceptual/gpu-arch/image004.png "Node-level system architecture with two AMD EPYC™ processors and eight AMD Instinct™ GPUs.")
+![Structure of a single GCD in the AMD Instinct MI100 accelerator](../../data/conceptual/gpu-arch/image004.png "Node-level system architecture with two AMD EPYC™ processors and eight AMD Instinct™ accelerators.")

 In a typical node configuration, each processor can host up to four AMD
-Instinct™ GPUs that are attached using PCIe Gen 4 links at 16 GT/sec,
+Instinct™ accelerators that are attached using PCIe Gen 4 links at 16 GT/sec,
 which corresponds to a peak bidirectional link bandwidth of 32 GB/sec. Each hive
-of four GPUs can participate in a fully connected, coherent AMD
-Instinct™ fabric that connects the four GPUs using 23 GT/sec AMD
+of four accelerators can participate in a fully connected, coherent AMD
+Instinct™ fabric that connects the four accelerators using 23 GT/sec AMD
 Infinity fabric links that run at a higher frequency than the inter-processor
 links. This inter-GPU link can be established in certified server systems if the
 GPUs are mounted in neighboring PCIe slots by installing the AMD Infinity
-Fabric™ bridge for the AMD Instinct™ GPUs.
+Fabric™ bridge for the AMD Instinct™ accelerators.

 ## Microarchitecture

-The microarchitecture of the AMD Instinct GPUs is based on the AMD CDNA
+The microarchitecture of the AMD Instinct accelerators is based on the AMD CDNA
 architecture, which targets compute applications such as high-performance
 computing (HPC) and AI & machine learning (ML) that run on everything from
 individual servers to the world's largest exascale supercomputers. The overall
 system architecture is designed for extreme scalability and compute performance.

-![Structure of the AMD Instinct GPU (MI100 generation)](../../data/conceptual/gpu-arch/image005.png "Structure of the AMD Instinct GPU (MI100 generation)")
+![Structure of the AMD Instinct accelerator (MI100 generation)](../../data/conceptual/gpu-arch/image005.png "Structure of the AMD Instinct accelerator (MI100 generation)")

-The above image shows the AMD Instinct GPU with its PCIe Gen 4 x16
+The above image shows the AMD Instinct accelerator with its PCIe Gen 4 x16
 link (16 GT/sec, at the bottom) that connects the GPU to (one of) the host
 processor(s). It also shows the three AMD Infinity Fabric ports that provide
 high-speed links (23 GT/sec, also at the bottom) to the other GPUs of the local
@@ -48,7 +48,7 @@ hive.

 On the left and right of the floor plan, the High Bandwidth Memory (HBM)
 attaches via the GPU memory controller.  The MI100 generation of the AMD
-Instinct GPU offers four stacks of HBM generation 2 (HBM2) for a total
+Instinct accelerator offers four stacks of HBM generation 2 (HBM2) for a total
 of 32GB with a 4,096bit-wide memory interface. The peak memory bandwidth of the
 attached HBM2 is 1.228 TB/sec at a memory clock frequency of 1.2 GHz.

@@ -64,7 +64,7 @@ Therefore, the theoretical maximum FP64 peak performance is 11.5 TFLOPS
 ![Block diagram of an MI100 compute unit with detailed SIMD view of the AMD CDNA architecture](../../data/conceptual/gpu-arch/image006.png "An MI100 compute unit with detailed SIMD view of the AMD CDNA architecture")

 The preceding image shows the block diagram of a single CU of an AMD Instinct™
-MI100 GPU and summarizes how instructions flow through the execution
+MI100 accelerator and summarizes how instructions flow through the execution
 engines. The CU fetches the instructions via a 32KB instruction cache and moves
 them forward to execution via a dispatcher. The CU can handle up to ten
 wavefronts at a time and feed their instructions into the execution unit. The
--- a/docs/conceptual/gpu-arch/mi250.md
+++ b/docs/conceptual/gpu-arch/mi250.md
@@ -1,13 +1,13 @@
 ---
 myst:
  html_meta:
-    "description lang=en": "Learn about the AMD Instinct MI250 Series architecture."
+    "description lang=en": "Learn about the AMD Instinct MI250 series architecture."
    "keywords": "Instinct, MI250, microarchitecture, AMD, ROCm"
 ---

 # AMD Instinct™ MI250 microarchitecture

-The microarchitecture of the AMD Instinct MI250 GPU is based on the
+The microarchitecture of the AMD Instinct MI250 accelerators is based on the
 AMD CDNA 2 architecture that targets compute applications such as HPC,
 artificial intelligence (AI), and machine learning (ML) and that run on
 everything from individual servers to the world’s largest exascale
@@ -40,7 +40,7 @@ execution units (also called matrix cores), which are geared toward executing
 matrix operations like matrix-matrix multiplications. For FP64, the peak
 performance of these units amounts to 90.5 TFLOPS.

-![Structure of a single GCD in the AMD Instinct MI250 GPU.](../../data/conceptual/gpu-arch/image001.png "Structure of a single GCD in the AMD Instinct MI250 GPU.")
+![Structure of a single GCD in the AMD Instinct MI250 accelerator.](../../data/conceptual/gpu-arch/image001.png "Structure of a single GCD in the AMD Instinct MI250 accelerator.")

 ```{list-table} Peak-performance capabilities of the MI250 OAM for different data types.
 :header-rows: 1
@@ -84,9 +84,16 @@ performance of these units amounts to 90.5 TFLOPS.
  - 362.1
 ```

-The above table summarizes the aggregated peak performance of the AMD Instinct MI250 Open Compute Platform (OCP) Open Accelerator Modules (OAMs) and its two GCDs for different data types and execution units. The middle column lists the peak performance (number of data elements processed in a single instruction) of a single compute unit if a SIMD (or matrix) instruction is being retired in each clock cycle. The third column lists the theoretical peak performance of the OAM module. The theoretical aggregated peak memory bandwidth of the GPU is 3.2 TB/sec (1.6 TB/sec per GCD).
+The above table summarizes the aggregated peak performance of the AMD
+Instinct MI250 Open Compute Platform (OCP) Open Accelerator Module (OAM) and
+its two GCDs for different data types and execution units. The
+middle column lists the peak performance (number of data elements processed in a
+single instruction) of a single compute unit if a SIMD (or matrix) instruction
+is being retired in each clock cycle. The third column lists the theoretical
+peak performance of the OAM module. The theoretical aggregated peak memory
+bandwidth of the GPU is 3.2 TB/sec (1.6 TB/sec per GCD).

-![Dual-GCD architecture of the AMD Instinct MI250 GPUs](../../data/conceptual/gpu-arch/image002.png "Dual-GCD architecture of the AMD Instinct MI250 GPUs")
+![Dual-GCD architecture of the AMD Instinct MI250 accelerators](../../data/conceptual/gpu-arch/image002.png "Dual-GCD architecture of the AMD Instinct MI250 accelerators")

 The following image shows the block diagram of an OAM package that consists
 of two GCDs, each of which constitutes one GPU device in the system. The two
@@ -98,18 +105,18 @@ between the two GCDs of an OAM, or a bidirectional peak transfer bandwidth of
 ## Node-level architecture

 The following image shows the node-level architecture of a system that is
-based on the AMD Instinct MI250 GPU. The MI250 OAMs attach to the host
+based on the AMD Instinct MI250 accelerator. The MI250 OAMs attach to the host
 system via PCIe Gen 4 x16 links (yellow lines). Each GCD maintains its own PCIe
 x16 link to the host part of the system. Depending on the server platform, the
 GCD can attach to the AMD EPYC processor directly or via an optional PCIe switch
 . Note that some platforms may offer an x8 interface to the GCDs, which reduces
 the available host-to-GPU bandwidth.

-![Block diagram of AMD Instinct MI250 GPUs with 3rd Generation AMD EPYC processor](../../data/conceptual/gpu-arch/image003.png "Block diagram of AMD Instinct MI250 GPUs with 3rd Generation AMD EPYC processor")
+![Block diagram of AMD Instinct MI250 accelerators with 3rd Generation AMD EPYC processor](../../data/conceptual/gpu-arch/image003.png "Block diagram of AMD Instinct MI250 accelerators with 3rd Generation AMD EPYC processor")

 The preceding image shows the node-level architecture of a system with AMD
 EPYC processors in a dual-socket configuration and four AMD Instinct MI250
-GPUs. The MI250 OAMs attach to the host processors system via PCIe Gen 4
+accelerators. The MI250 OAMs attach to the host processors system via PCIe Gen 4
 x16 links (yellow lines). Depending on the system design, a PCIe switch may
 exist to make more PCIe lanes available for additional components like network
 interfaces and/or storage devices. Each GCD maintains its own PCIe x16 link to
--- a/docs/conceptual/gpu-arch/mi300-mi200-performance-counters.rst
+++ b/docs/conceptual/gpu-arch/mi300-mi200-performance-counters.rst
@@ -1,16 +1,16 @@
 .. meta::
-  :description: MI300 and MI200 Series performance counters and metrics
+  :description: MI300 and MI200 series performance counters and metrics
  :keywords: MI300, MI200, performance counters, command processor counters

 ***************************************************************************************************
-MI300 and MI200 Series performance counters and metrics
+MI300 and MI200 series performance counters and metrics
 ***************************************************************************************************

 This document lists and describes the hardware performance counters and derived metrics available
 for the AMD Instinct™ MI300 and MI200 GPU. You can also access this information using the
 :doc:`ROCprofiler-SDK <rocprofiler-sdk:how-to/using-rocprofv3>`.

-MI300 and MI200 Series performance counters
+MI300 and MI200 series performance counters
 ===============================================================

 Series performance counters include the following categories:
@@ -27,7 +27,7 @@ The following sections provide additional details for each category.

 .. note::

-  Preliminary validation of all MI300 and MI200 Series performance counters is in progress. Those with
+  Preliminary validation of all MI300 and MI200 series performance counters is in progress. Those with
  an asterisk (*) require further evaluation.

 .. _command-processor-counters:
@@ -171,7 +171,7 @@ Instruction mix
  "``SQ_INSTS_SMEM``", "Instr", "Number of scalar memory instructions issued"
  "``SQ_INSTS_SMEM_NORM``", "Instr", "Number of scalar memory instructions normalized to match ``smem_level`` issued"
  "``SQ_INSTS_FLAT``", "Instr", "Number of flat instructions issued"
-  "``SQ_INSTS_FLAT_LDS_ONLY``", "Instr", "**MI200 Series only** Number of FLAT instructions that read/write only from/to LDS issued. Works only if ``EARLY_TA_DONE`` is enabled."
+  "``SQ_INSTS_FLAT_LDS_ONLY``", "Instr", "**MI200 series only** Number of FLAT instructions that read/write only from/to LDS issued. Works only if ``EARLY_TA_DONE`` is enabled."
  "``SQ_INSTS_LDS``", "Instr", "Number of LDS instructions issued **(MI200: includes flat; MI300: does not include flat)**"
  "``SQ_INSTS_GDS``", "Instr", "Number of global data share instructions issued"
  "``SQ_INSTS_EXP_GDS``", "Instr", "Number of EXP and global data share instructions excluding skipped export instructions issued"
@@ -396,9 +396,9 @@ Texture cache per pipe counters
  "``TCP_UTCL1_TRANSLATION_MISS[n]``", "Req", "Number of unified translation cache (L1) translation misses", "0-15"
  "``TCP_UTCL1_PERMISSION_MISS[n]``", "Req", "Number of unified translation cache (L1) permission misses", "0-15"
  "``TCP_TOTAL_CACHE_ACCESSES[n]``", "Req", "Number of vector L1d cache accesses including hits and misses", "0-15"
-  "``TCP_TCP_LATENCY[n]``", "Cycles", "**MI200 Series only** Accumulated wave access latency to vL1D over all wavefronts", "0-15"
-  "``TCP_TCC_READ_REQ_LATENCY[n]``", "Cycles", "**MI200 Series only** Total vL1D to L2 request latency over all wavefronts for reads and atomics with return", "0-15"
-  "``TCP_TCC_WRITE_REQ_LATENCY[n]``", "Cycles", "**MI200 Series only** Total vL1D to L2 request latency over all wavefronts for writes and atomics without return", "0-15"
+  "``TCP_TCP_LATENCY[n]``", "Cycles", "**MI200 series only** Accumulated wave access latency to vL1D over all wavefronts", "0-15"
+  "``TCP_TCC_READ_REQ_LATENCY[n]``", "Cycles", "**MI200 series only** Total vL1D to L2 request latency over all wavefronts for reads and atomics with return", "0-15"
+  "``TCP_TCC_WRITE_REQ_LATENCY[n]``", "Cycles", "**MI200 series only** Total vL1D to L2 request latency over all wavefronts for writes and atomics without return", "0-15"
  "``TCP_TCC_READ_REQ[n]``", "Req", "Number of read requests to L2 cache", "0-15"
  "``TCP_TCC_WRITE_REQ[n]``", "Req", "Number of write requests to L2 cache", "0-15"
  "``TCP_TCC_ATOMIC_WITH_RET_REQ[n]``", "Req", "Number of atomic requests to L2 cache with return", "0-15"
@@ -560,7 +560,7 @@ Note the following:
  ``TCC_TAG_STALL[n]``, probes can stall the pipeline at a variety of places. There is no single point that
  can accurately measure the total stalls

-MI300 and MI200 Series derived metrics list
+MI300 and MI200 series derived metrics list
 ==============================================================

 .. csv-table::
--- a/docs/conceptual/gpu-arch/mi300.md
+++ b/docs/conceptual/gpu-arch/mi300.md
@@ -1,21 +1,21 @@
 ---
 myst:
  html_meta:
-    "description lang=en": "Learn about the AMD Instinct MI300 Series architecture."
+    "description lang=en": "Learn about the AMD Instinct MI300 series architecture."
    "keywords": "Instinct, MI300X, MI300A, microarchitecture, AMD, ROCm"
 ---

-# AMD Instinct™ MI300 Series microarchitecture
+# AMD Instinct™ MI300 series microarchitecture

-The AMD Instinct MI300 Series GPUs are based on the AMD CDNA 3
+The AMD Instinct MI300 series accelerators are based on the AMD CDNA 3
 architecture which was designed to deliver leadership performance for HPC, artificial intelligence (AI), and machine
-learning (ML) workloads. The AMD Instinct MI300 Series GPUs are well-suited for extreme scalability and compute performance, running
+learning (ML) workloads. The AMD Instinct MI300 series accelerators are well-suited for extreme scalability and compute performance, running
 on everything from individual servers to the world’s largest exascale supercomputers.

-With the MI300 Series, AMD is introducing the Accelerator Complex Die (XCD), which contains the
+With the MI300 series, AMD is introducing the Accelerator Complex Die (XCD), which contains the
 GPU computational elements of the processor along with the lower levels of the cache hierarchy.

-The following image depicts the structure of a single XCD in the AMD Instinct MI300 GPU Series.
+The following image depicts the structure of a single XCD in the AMD Instinct MI300 accelerator series.

 ```{figure} ../../data/shared/xcd-sys-arch.png
 ---
@@ -39,7 +39,7 @@ infrastructure) using the AMD Infinity Fabric™ technology as interconnect.
 The Matrix Cores inside the CDNA 3 CUs have significant improvements, emphasizing AI and machine
 learning, enhancing throughput of existing data types while adding support for new data types.
 CDNA 2 Matrix Cores support FP16 and BF16, while offering INT8 for inference. Compared to MI250X
-GPUs, CDNA 3 Matrix Cores triple the performance for FP16 and BF16, while providing a
+accelerators, CDNA 3 Matrix Cores triple the performance for FP16 and BF16, while providing a
 performance gain of 6.8 times for INT8. FP8 has a performance gain of 16 times compared to FP32,
 while TF32 has a gain of 4 times compared to FP32.

@@ -105,7 +105,7 @@ name: mi300-arch
 alt:
 align: center
 ---
-MI300 Series system architecture showing MI300A (left) with 6 XCDs and 3 CCDs, while the MI300X (right) has 8 XCDs.
+MI300 series system architecture showing MI300A (left) with 6 XCDs and 3 CCDs, while the MI300X (right) has 8 XCDs.
 ```

 ## Node-level architecture
@@ -116,11 +116,11 @@ name: mi300-node

 align: center
 ---
-MI300 Series node-level architecture showing 8 fully interconnected MI300X OAM modules connected to (optional) PCIEe switches via retimers and HGX connectors.
+MI300 series node-level architecture showing 8 fully interconnected MI300X OAM modules connected to (optional) PCIe switches via retimers and HGX connectors.
 ```

 The image above shows the node-level architecture of a system with AMD EPYC processors in a
-dual-socket configuration and eight AMD Instinct MI300X GPUs. The MI300X OAMs attach to the
+dual-socket configuration and eight AMD Instinct MI300X accelerators. The MI300X OAMs attach to the
 host system via PCIe Gen 5 x16 links (yellow lines). The GPUs are using seven high-bandwidth,
 low-latency AMD Infinity Fabric™ links (red lines) to form a fully connected 8-GPU system.

--- a/docs/conceptual/gpu-arch/mi350-performance-counters.rst
+++ b/docs/conceptual/gpu-arch/mi350-performance-counters.rst
@@ -1,12 +1,12 @@
 .. meta::
-  :description: MI355 Series performance counters and metrics
+  :description: MI350 series performance counters and metrics
  :keywords: MI355, MI355X, MI3XX

 ***********************************
-MI350 Series performance counters
+MI350 series performance counters
 ***********************************

-This topic lists and describes the hardware performance counters and derived metrics available on the AMD Instinct MI350 and MI355 GPUs. These counters are available for profiling using `ROCprofiler-SDK <https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/index.html>`_ and `ROCm Compute Profiler <https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/>`_.
+This topic lists and describes the hardware performance counters and derived metrics available on the AMD Instinct MI350 and MI355 accelerators. These counters are available for profiling using `ROCprofiler-SDK <https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/index.html>`_ and `ROCm Compute Profiler <https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/>`_.

 The following sections list the performance counters based on the IP blocks.

--- a/docs/conf.py
+++ b/docs/conf.py
@@ -89,15 +89,15 @@ project = "ROCm Documentation"
 project_path = os.path.abspath(".").replace("\\", "/")
 author = "Advanced Micro Devices, Inc."
 copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
-version = "7.0.2"
-release = "7.0.2"
+version = "7.0.0"
+release = "7.0.0"
 setting_all_article_info = True
 all_article_info_os = ["linux", "windows"]
 all_article_info_author = ""

 # pages with specific settings
 article_pages = [
-    {"file": "about/release-notes", "os": ["linux"], "date": "2025-10-10"},
+    {"file": "about/release-notes", "os": ["linux"], "date": "2025-09-16"},
    {"file": "release/changelog", "os": ["linux"],},
    {"file": "compatibility/compatibility-matrix", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
@@ -110,15 +110,11 @@ article_pages = [
    {"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
-    {"file": "compatibility/ml-compatibility/flashinfer-compatibility", "os": ["linux"]},
    {"file": "how-to/deep-learning-rocm", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/system-setup/index", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/system-setup/multi-node-setup", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/system-setup/prerequisite-system-validation", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/system-setup/system-health-check", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
@@ -131,9 +127,7 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.7", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/primus-megatron-v25.7", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
@@ -234,7 +228,7 @@ suppress_warnings = ["autosectionlabel.*"]

 html_context = {
    "project_path" : {project_path},
-    "gpu_type" : [('AMD Instinct GPUs', 'intrinsic'), ('AMD gfx families', 'gfx'), ('NVIDIA families', 'nvidia') ],
+    "gpu_type" : [('AMD Instinct accelerators', 'intrinsic'), ('AMD gfx families', 'gfx'), ('NVIDIA families', 'nvidia') ],
    "atomics_type" : [('HW atomics', 'hw-atomics'), ('CAS emulation', 'cas-atomics')],
    "pcie_type" : [('No PCIe atomics', 'nopcie'), ('PCIe atomics', 'pcie')],
    "memory_type" : [('Device DRAM', 'device-dram'), ('Migratable Host DRAM', 'migratable-host-dram'), ('Pinned Host DRAM', 'pinned-host-dram')],
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
@@ -1,188 +0,0 @@
-dockers:
-  - pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
-    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
-    components:
-      ROCm: 6.4.1
-      vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
-      PyTorch: 2.7.0+gitf717b2a
-      hipBLASLt: 0.15
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-    - model: Llama 3.1 8B
-      mad_tag: pyt_vllm_llama-3.1-8b
-      model_repo: meta-llama/Llama-3.1-8B-Instruct
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: float16
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 3.1 70B
-      mad_tag: pyt_vllm_llama-3.1-70b
-      model_repo: meta-llama/Llama-3.1-70B-Instruct
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 3.1 405B
-      mad_tag: pyt_vllm_llama-3.1-405b
-      model_repo: meta-llama/Llama-3.1-405B-Instruct
-      url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 2 70B
-      mad_tag: pyt_vllm_llama-2-70b
-      model_repo: meta-llama/Llama-2-70b-chat-hf
-      url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 4096
-        max_num_batched_tokens: 4096
-        max_model_len: 4096
-    - model: Llama 3.1 8B FP8
-      mad_tag: pyt_vllm_llama-3.1-8b_fp8
-      model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
-      url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
-      precision: float8
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 3.1 70B FP8
-      mad_tag: pyt_vllm_llama-3.1-70b_fp8
-      model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
-      url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
-      precision: float8
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Llama 3.1 405B FP8
-      mad_tag: pyt_vllm_llama-3.1-405b_fp8
-      model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
-      url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
-      precision: float8
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-  - group: Mistral AI
-    tag: mistral
-    models:
-    - model: Mixtral MoE 8x7B
-      mad_tag: pyt_vllm_mixtral-8x7b
-      model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
-      url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 32768
-        max_num_batched_tokens: 32768
-        max_model_len: 8192
-    - model: Mixtral MoE 8x22B
-      mad_tag: pyt_vllm_mixtral-8x22b
-      model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
-      url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
-      precision: float16
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 65536
-        max_num_batched_tokens: 65536
-        max_model_len: 8192
-    - model: Mixtral MoE 8x7B FP8
-      mad_tag: pyt_vllm_mixtral-8x7b_fp8
-      model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-      url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-      precision: float8
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 32768
-        max_num_batched_tokens: 32768
-        max_model_len: 8192
-    - model: Mixtral MoE 8x22B FP8
-      mad_tag: pyt_vllm_mixtral-8x22b_fp8
-      model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-      url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-      precision: float8
-      config:
-        tp: 8
-        dtype: auto
-        kv_cache_dtype: fp8
-        max_seq_len_to_capture: 65536
-        max_num_batched_tokens: 65536
-        max_model_len: 8192
-  - group: Qwen
-    tag: qwen
-    models:
-    - model: QwQ-32B
-      mad_tag: pyt_vllm_qwq-32b
-      model_repo: Qwen/QwQ-32B
-      url: https://huggingface.co/Qwen/QwQ-32B
-      precision: float16
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 131072
-        max_num_batched_tokens: 131072
-        max_model_len: 8192
-    - model: Qwen3 30B A3B
-      mad_tag: pyt_vllm_qwen3-30b-a3b
-      model_repo: Qwen/Qwen3-30B-A3B
-      url: https://huggingface.co/Qwen/Qwen3-30B-A3B
-      precision: float16
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 32768
-        max_num_batched_tokens: 32768
-        max_model_len: 8192
-  - group: Microsoft Phi
-    tag: phi
-    models:
-    - model: Phi-4
-      mad_tag: pyt_vllm_phi-4
-      model_repo: microsoft/phi-4
-      url: https://huggingface.co/microsoft/phi-4
-      config:
-        tp: 1
-        dtype: auto
-        kv_cache_dtype: auto
-        max_seq_len_to_capture: 16384
-        max_num_batched_tokens: 16384
-        max_model_len: 8192
--- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
@@ -1,316 +1,188 @@
 dockers:
-  - pull_tag: rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006
-    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5
+  - pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
+    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
    components:
-      ROCm: 7.0.0
-      vLLM: 0.10.2 (0.11.0rc2.dev160+g790d22168.rocm700)
-      PyTorch: 2.9.0a0+git1c57644
-      hipBLASLt: 1.0.0
-    dockerfile:
-      commit: 790d22168820507f3105fef29596549378cfe399
+      ROCm: 6.4.1
+      vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
+      PyTorch: 2.7.0+gitf717b2a
+      hipBLASLt: 0.15
 model_groups:
  - group: Meta Llama
    tag: llama
    models:
-      - model: Llama 2 70B
-        mad_tag: pyt_vllm_llama-2-70b
-        model_repo: meta-llama/Llama-2-70b-chat-hf
-        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 4096
-          max_model_len: 4096
-      - model: Llama 3.1 8B
-        mad_tag: pyt_vllm_llama-3.1-8b
-        model_repo: meta-llama/Llama-3.1-8B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-8B
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.1 8B FP8
-        mad_tag: pyt_vllm_llama-3.1-8b_fp8
-        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
-        precision: float8
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.1 405B
-        mad_tag: pyt_vllm_llama-3.1-405b
-        model_repo: meta-llama/Llama-3.1-405B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.1 405B FP8
-        mad_tag: pyt_vllm_llama-3.1-405b_fp8
-        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.1 405B MXFP4
-        mad_tag: pyt_vllm_llama-3.1-405b_fp4
-        model_repo: amd/Llama-3.1-405B-Instruct-MXFP4-Preview
-        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-MXFP4-Preview
-        precision: float4
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.3 70B
-        mad_tag: pyt_vllm_llama-3.3-70b
-        model_repo: meta-llama/Llama-3.3-70B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.3 70B FP8
-        mad_tag: pyt_vllm_llama-3.3-70b_fp8
-        model_repo: amd/Llama-3.3-70B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-FP8-KV
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 3.3 70B MXFP4
-        mad_tag: pyt_vllm_llama-3.3-70b_fp4
-        model_repo: amd/Llama-3.3-70B-Instruct-MXFP4-Preview
-        url: https://huggingface.co/amd/Llama-3.3-70B-Instruct-MXFP4-Preview
-        precision: float4
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-      - model: Llama 4 Scout 17Bx16E
-        mad_tag: pyt_vllm_llama-4-scout-17b-16e
-        model_repo: meta-llama/Llama-4-Scout-17B-16E-Instruct
-        url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Llama 4 Maverick 17Bx128E
-        mad_tag: pyt_vllm_llama-4-maverick-17b-128e
-        model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct
-        url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Llama 4 Maverick 17Bx128E FP8
-        mad_tag: pyt_vllm_llama-4-maverick-17b-128e_fp8
-        model_repo: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
-        url: https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek R1 0528 FP8
-        mad_tag: pyt_vllm_deepseek-r1
-        model_repo: deepseek-ai/DeepSeek-R1-0528
-        url: https://huggingface.co/deepseek-ai/DeepSeek-R1-0528
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_seqs: 1024
-          max_num_batched_tokens: 131072
-          max_model_len: 8192
-  - group: OpenAI GPT OSS
-    tag: gpt-oss
-    models:
-      - model: GPT OSS 20B
-        mad_tag: pyt_vllm_gpt-oss-20b
-        model_repo: openai/gpt-oss-20b
-        url: https://huggingface.co/openai/gpt-oss-20b
-        precision: bfloat16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 8192
-          max_model_len: 8192
-      - model: GPT OSS 120B
-        mad_tag: pyt_vllm_gpt-oss-120b
-        model_repo: openai/gpt-oss-120b
-        url: https://huggingface.co/openai/gpt-oss-120b
-        precision: bfloat16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 8192
-          max_model_len: 8192
+    - model: Llama 3.1 8B
+      mad_tag: pyt_vllm_llama-3.1-8b
+      model_repo: meta-llama/Llama-3.1-8B-Instruct
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 70B
+      mad_tag: pyt_vllm_llama-3.1-70b
+      model_repo: meta-llama/Llama-3.1-70B-Instruct
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 405B
+      mad_tag: pyt_vllm_llama-3.1-405b
+      model_repo: meta-llama/Llama-3.1-405B-Instruct
+      url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 2 70B
+      mad_tag: pyt_vllm_llama-2-70b
+      model_repo: meta-llama/Llama-2-70b-chat-hf
+      url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 4096
+        max_num_batched_tokens: 4096
+        max_model_len: 4096
+    - model: Llama 3.1 8B FP8
+      mad_tag: pyt_vllm_llama-3.1-8b_fp8
+      model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+      url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+      precision: float8
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 70B FP8
+      mad_tag: pyt_vllm_llama-3.1-70b_fp8
+      model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+      url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 405B FP8
+      mad_tag: pyt_vllm_llama-3.1-405b_fp8
+      model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+      url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
  - group: Mistral AI
    tag: mistral
    models:
-      - model: Mixtral MoE 8x7B
-        mad_tag: pyt_vllm_mixtral-8x7b
-        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Mixtral MoE 8x7B FP8
-        mad_tag: pyt_vllm_mixtral-8x7b_fp8
-        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 32768
-          max_model_len: 8192
-      - model: Mixtral MoE 8x22B
-        mad_tag: pyt_vllm_mixtral-8x22b
-        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 65536
-          max_model_len: 8192
-      - model: Mixtral MoE 8x22B FP8
-        mad_tag: pyt_vllm_mixtral-8x22b_fp8
-        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 65536
-          max_model_len: 8192
+    - model: Mixtral MoE 8x7B
+      mad_tag: pyt_vllm_mixtral-8x7b
+      model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+      url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
+    - model: Mixtral MoE 8x22B
+      mad_tag: pyt_vllm_mixtral-8x22b
+      model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+      url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 65536
+        max_num_batched_tokens: 65536
+        max_model_len: 8192
+    - model: Mixtral MoE 8x7B FP8
+      mad_tag: pyt_vllm_mixtral-8x7b_fp8
+      model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+      url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
+    - model: Mixtral MoE 8x22B FP8
+      mad_tag: pyt_vllm_mixtral-8x22b_fp8
+      model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+      url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 65536
+        max_num_batched_tokens: 65536
+        max_model_len: 8192
  - group: Qwen
    tag: qwen
    models:
-      - model: Qwen3 8B
-        mad_tag: pyt_vllm_qwen3-8b
-        model_repo: Qwen/Qwen3-8B
-        url: https://huggingface.co/Qwen/Qwen3-8B
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 32B
-        mad_tag: pyt_vllm_qwen3-32b
-        model_repo: Qwen/Qwen3-32b
-        url: https://huggingface.co/Qwen/Qwen3-32B
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 30B A3B
-        mad_tag: pyt_vllm_qwen3-30b-a3b
-        model_repo: Qwen/Qwen3-30B-A3B
-        url: https://huggingface.co/Qwen/Qwen3-30B-A3B
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 30B A3B FP8
-        mad_tag: pyt_vllm_qwen3-30b-a3b_fp8
-        model_repo: Qwen/Qwen3-30B-A3B-FP8
-        url: https://huggingface.co/Qwen/Qwen3-30B-A3B-FP8
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 235B A22B
-        mad_tag: pyt_vllm_qwen3-235b-a22b
-        model_repo: Qwen/Qwen3-235B-A22B
-        url: https://huggingface.co/Qwen/Qwen3-235B-A22B
-        precision: float16
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
-      - model: Qwen3 235B A22B FP8
-        mad_tag: pyt_vllm_qwen3-235b-a22b_fp8
-        model_repo: Qwen/Qwen3-235B-A22B-FP8
-        url: https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8
-        precision: float8
-        config:
-          tp: 8
-          dtype: auto
-          kv_cache_dtype: fp8
-          max_num_batched_tokens: 40960
-          max_model_len: 8192
+    - model: QwQ-32B
+      mad_tag: pyt_vllm_qwq-32b
+      model_repo: Qwen/QwQ-32B
+      url: https://huggingface.co/Qwen/QwQ-32B
+      precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Qwen3 30B A3B
+      mad_tag: pyt_vllm_qwen3-30b-a3b
+      model_repo: Qwen/Qwen3-30B-A3B
+      url: https://huggingface.co/Qwen/Qwen3-30B-A3B
+      precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
  - group: Microsoft Phi
    tag: phi
    models:
-      - model: Phi-4
-        mad_tag: pyt_vllm_phi-4
-        model_repo: microsoft/phi-4
-        url: https://huggingface.co/microsoft/phi-4
-        precision: float16
-        config:
-          tp: 1
-          dtype: auto
-          kv_cache_dtype: auto
-          max_num_batched_tokens: 16384
-          max_model_len: 8192
+    - model: Phi-4
+      mad_tag: pyt_vllm_phi-4
+      model_repo: microsoft/phi-4
+      url: https://huggingface.co/microsoft/phi-4
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 16384
+        max_num_batched_tokens: 16384
+        max_model_len: 8192
--- a/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
@@ -1,16 +1,47 @@
 dockers:
-  - pull_tag: rocm/jax-training:maxtext-v25.9
+  - pull_tag: rocm/jax-training:maxtext-v25.7
    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
    components:
-      ROCm: 7.0.0
-      JAX: 0.6.2
-      Python: 3.10.18
-      Transformer Engine: 2.2.0.dev0+c91bac54
+      ROCm: 6.4.1
+      JAX: 0.5.0
+      Python: 3.10.12
+      Transformer Engine: 2.1.0+90d703dd
      hipBLASLt: 1.x.x
+  - pull_tag: rocm/jax-training:maxtext-v25.7-jax060
+    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
+    components:
+      ROCm: 6.4.1
+      JAX: 0.6.0
+      Python: 3.10.12
+      Transformer Engine: 2.1.0+90d703dd
+      hipBLASLt: 1.1.0-499ece1c21
 model_groups:
  - group: Meta Llama
    tag: llama
    models:
+      - model: Llama 3.3 70B
+        mad_tag: jax_maxtext_train_llama-3.3-70b
+        model_repo: Llama-3.3-70B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3.1 8B
+        mad_tag: jax_maxtext_train_llama-3.1-8b
+        model_repo: Llama-3.1-8B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3.1 70B
+        mad_tag: jax_maxtext_train_llama-3.1-70b
+        model_repo: Llama-3.1-70B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3 8B
+        mad_tag: jax_maxtext_train_llama-3-8b
+        multinode_training_script: llama3_8b_multinode.sh
+        doc_options: ["multi-node"]
+      - model: Llama 3 70B
+        mad_tag: jax_maxtext_train_llama-3-70b
+        multinode_training_script: llama3_70b_multinode.sh
+        doc_options: ["multi-node"]
      - model: Llama 2 7B
        mad_tag: jax_maxtext_train_llama-2-7b
        model_repo: Llama-2-7B
@@ -23,29 +54,6 @@ model_groups:
        precision: bf16
        multinode_training_script: llama2_70b_multinode.sh
        doc_options: ["single-node", "multi-node"]
-      - model: Llama 3 8B (multi-node)
-        mad_tag: jax_maxtext_train_llama-3-8b
-        multinode_training_script: llama3_8b_multinode.sh
-        doc_options: ["multi-node"]
-      - model: Llama 3 70B (multi-node)
-        mad_tag: jax_maxtext_train_llama-3-70b
-        multinode_training_script: llama3_70b_multinode.sh
-        doc_options: ["multi-node"]
-      - model: Llama 3.1 8B
-        mad_tag: jax_maxtext_train_llama-3.1-8b
-        model_repo: Llama-3.1-8B
-        precision: bf16
-        doc_options: ["single-node"]
-      - model: Llama 3.1 70B
-        mad_tag: jax_maxtext_train_llama-3.1-70b
-        model_repo: Llama-3.1-70B
-        precision: bf16
-        doc_options: ["single-node"]
-      - model: Llama 3.3 70B
-        mad_tag: jax_maxtext_train_llama-3.3-70b
-        model_repo: Llama-3.3-70B
-        precision: bf16
-        doc_options: ["single-node"]
  - group: DeepSeek
    tag: deepseek
    models:
--- a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
@@ -1,21 +1,15 @@
 dockers:
-  MI355X and MI350X:
-    pull_tag: rocm/megatron-lm:v25.9_gfx950
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
-    components: &docker_components
-      ROCm: 7.0.0
-      Primus: aab4234
-      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
+    components:
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      Flash Attention: 2.8.3
-      hipBLASLt: 911283acd1
-      Triton: 3.4.0+rocm7.0.0.git56765e8c
-      RCCL: 2.26.6
-  MI325X and MI300X:
-    pull_tag: rocm/megatron-lm:v25.9_gfx942
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
-    components: *docker_components
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
+      Triton: 3.3.0
+      RCCL: 2.22.3
 model_groups:
  - group: Meta Llama
    tag: llama
@@ -26,6 +20,8 @@ model_groups:
        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
      - model: Llama 3.1 70B
        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
+      - model: Llama 3.1 70B (proxy)
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
      - model: Llama 2 7B
        mad_tag: pyt_megatron_lm_train_llama-2-7b
      - model: Llama 2 70B
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/jax-maxtext-v25.7-benchmark-models.yaml
@@ -1,72 +0,0 @@
-dockers:
-  - pull_tag: rocm/jax-training:maxtext-v25.7-jax060
-    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
-    components:
-      ROCm: 6.4.1
-      JAX: 0.6.0
-      Python: 3.10.12
-      Transformer Engine: 2.1.0+90d703dd
-      hipBLASLt: 1.1.0-499ece1c21
-  - pull_tag: rocm/jax-training:maxtext-v25.7
-    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
-    components:
-      ROCm: 6.4.1
-      JAX: 0.5.0
-      Python: 3.10.12
-      Transformer Engine: 2.1.0+90d703dd
-      hipBLASLt: 1.x.x
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: jax_maxtext_train_llama-3.3-70b
-        model_repo: Llama-3.3-70B
-        precision: bf16
-        doc_options: ["single-node"]
-      - model: Llama 3.1 8B
-        mad_tag: jax_maxtext_train_llama-3.1-8b
-        model_repo: Llama-3.1-8B
-        precision: bf16
-        doc_options: ["single-node"]
-      - model: Llama 3.1 70B
-        mad_tag: jax_maxtext_train_llama-3.1-70b
-        model_repo: Llama-3.1-70B
-        precision: bf16
-        doc_options: ["single-node"]
-      - model: Llama 3 8B
-        mad_tag: jax_maxtext_train_llama-3-8b
-        multinode_training_script: llama3_8b_multinode.sh
-        doc_options: ["multi-node"]
-      - model: Llama 3 70B
-        mad_tag: jax_maxtext_train_llama-3-70b
-        multinode_training_script: llama3_70b_multinode.sh
-        doc_options: ["multi-node"]
-      - model: Llama 2 7B
-        mad_tag: jax_maxtext_train_llama-2-7b
-        model_repo: Llama-2-7B
-        precision: bf16
-        multinode_training_script: llama2_7b_multinode.sh
-        doc_options: ["single-node", "multi-node"]
-      - model: Llama 2 70B
-        mad_tag: jax_maxtext_train_llama-2-70b
-        model_repo: Llama-2-70B
-        precision: bf16
-        multinode_training_script: llama2_70b_multinode.sh
-        doc_options: ["single-node", "multi-node"]
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V2-Lite (16B)
-        mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
-        model_repo: DeepSeek-V2-lite
-        precision: bf16
-        doc_options: ["single-node"]
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: jax_maxtext_train_mixtral-8x7b
-        model_repo: Mixtral-8x7B
-        precision: bf16
-        doc_options: ["single-node"]
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.7-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.7-benchmark-models.yaml
@@ -1,49 +0,0 @@
-dockers:
-  - pull_tag: rocm/megatron-lm:v25.7_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
-    components:
-      ROCm: 6.4.2
-      Primus: v0.1.0-rc1
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: "3.10"
-      Transformer Engine: 2.1.0.dev0+ba586519
-      hipBLASLt: 37ba1d36
-      Triton: 3.3.0
-      RCCL: 2.22.3
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
-      - model: Llama 3.1 8B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
-      - model: Llama 3.1 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
-      - model: Llama 3.1 70B (proxy)
-        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
-      - model: Llama 2 7B
-        mad_tag: pyt_megatron_lm_train_llama-2-7b
-      - model: Llama 2 70B
-        mad_tag: pyt_megatron_lm_train_llama-2-70b
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V3 (proxy)
-        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
-      - model: DeepSeek-V2-Lite
-        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
-      - model: Mixtral 8x22B (proxy)
-        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 2.5 7B
-        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
-      - model: Qwen 2.5 72B
-        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.8-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.8-benchmark-models.yaml
@@ -1,48 +0,0 @@
-dockers:
-  - pull_tag: rocm/megatron-lm:v25.8_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-50fc824361054e445e86d5d88d5f58817f61f8ec83ad4a7e43ea38bbc4a142c0
-    components:
-      ROCm: 6.4.3
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      hipBLASLt: d1b517fc7a
-      Triton: 3.3.0
-      RCCL: 2.22.3
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
-      - model: Llama 3.1 8B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
-      - model: Llama 3.1 70B
-        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
-      - model: Llama 3.1 70B (proxy)
-        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
-      - model: Llama 2 7B
-        mad_tag: pyt_megatron_lm_train_llama-2-7b
-      - model: Llama 2 70B
-        mad_tag: pyt_megatron_lm_train_llama-2-70b
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V3 (proxy)
-        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
-      - model: DeepSeek-V2-Lite
-        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
-      - model: Mixtral 8x22B (proxy)
-        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 2.5 7B
-        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
-      - model: Qwen 2.5 72B
-        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.7-benchmark-models.yaml
@@ -1,58 +0,0 @@
-dockers:
-  - pull_tag: rocm/megatron-lm:v25.7_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
-    components:
-      ROCm: 6.4.2
-      Primus: v0.1.0-rc1
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: "3.10"
-      Transformer Engine: 2.1.0.dev0+ba586519
-      hipBLASLt: 37ba1d36
-      Triton: 3.3.0
-      RCCL: 2.22.3
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
-        config_name: llama3.3_70B-pretrain.yaml
-      - model: Llama 3.1 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
-        config_name: llama3.1_70B-pretrain.yaml
-      - model: Llama 3.1 8B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
-        config_name: llama3.1_8B-pretrain.yaml
-      - model: Llama 2 7B
-        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
-        config_name: llama2_7B-pretrain.yaml
-      - model: Llama 2 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
-        config_name: llama2_70B-pretrain.yaml
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V3 (proxy)
-        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
-        config_name: deepseek_v3-pretrain.yaml
-      - model: DeepSeek-V2-Lite
-        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
-        config_name: deepseek_v2_lite-pretrain.yaml
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
-        config_name: mixtral_8x7B_v0.1-pretrain.yaml
-      - model: Mixtral 8x22B (proxy)
-        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
-        config_name: mixtral_8x22B_v0.1-pretrain.yaml
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 2.5 7B
-        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
-        config_name: primus_qwen2.5_7B-pretrain.yaml
-      - model: Qwen 2.5 72B
-        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
-        config_name: qwen2.5_72B-pretrain.yaml
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.8-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-megatron-v25.8-benchmark-models.yaml
@@ -1,58 +0,0 @@
-dockers:
-  - pull_tag: rocm/megatron-lm:v25.8_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.8_py310/images/sha256-50fc824361054e445e86d5d88d5f58817f61f8ec83ad4a7e43ea38bbc4a142c0
-    components:
-      ROCm: 6.4.3
-      Primus: 927a717
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      hipBLASLt: d1b517fc7a
-      Triton: 3.3.0
-      RCCL: 2.22.3
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-      - model: Llama 3.3 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
-        config_name: llama3.3_70B-pretrain.yaml
-      - model: Llama 3.1 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
-        config_name: llama3.1_70B-pretrain.yaml
-      - model: Llama 3.1 8B
-        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
-        config_name: llama3.1_8B-pretrain.yaml
-      - model: Llama 2 7B
-        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
-        config_name: llama2_7B-pretrain.yaml
-      - model: Llama 2 70B
-        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
-        config_name: llama2_70B-pretrain.yaml
-  - group: DeepSeek
-    tag: deepseek
-    models:
-      - model: DeepSeek-V3 (proxy)
-        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
-        config_name: deepseek_v3-pretrain.yaml
-      - model: DeepSeek-V2-Lite
-        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
-        config_name: deepseek_v2_lite-pretrain.yaml
-  - group: Mistral AI
-    tag: mistral
-    models:
-      - model: Mixtral 8x7B
-        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
-        config_name: mixtral_8x7B_v0.1-pretrain.yaml
-      - model: Mixtral 8x22B (proxy)
-        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
-        config_name: mixtral_8x22B_v0.1-pretrain.yaml
-  - group: Qwen
-    tag: qwen
-    models:
-      - model: Qwen 2.5 7B
-        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
-        config_name: primus_qwen2.5_7B-pretrain.yaml
-      - model: Qwen 2.5 72B
-        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
-        config_name: qwen2.5_72B-pretrain.yaml
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.8-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/primus-pytorch-v25.8-benchmark-models.yaml
@@ -1,24 +0,0 @@
-dockers:
-  - pull_tag: rocm/pytorch-training:v25.8
-    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
-    components:
-      ROCm: 6.4.3
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: 3.10.18
-      Transformer Engine: 2.2.0.dev0+a1e66aae
-      Flash Attention: 3.0.0.post1
-      hipBLASLt: 1.1.0-d1b517fc7a
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-    - model: Llama 3.1 8B
-      mad_tag: primus_pyt_train_llama-3.1-8b
-      model_repo: Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-    - model: Llama 3.1 70B
-      mad_tag: primus_pyt_train_llama-3.1-70b
-      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B
-      precision: BF16
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.8-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.8-benchmark-models.yaml
@@ -1,178 +0,0 @@
-dockers:
-  - pull_tag: rocm/pytorch-training:v25.8
-    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
-    components:
-      ROCm: 6.4.3
-      PyTorch: 2.8.0a0+gitd06a406
-      Python: 3.10.18
-      Transformer Engine: 2.2.0.dev0+a1e66aae
-      Flash Attention: 3.0.0.post1
-      hipBLASLt: 1.1.0-d1b517fc7a
-model_groups:
-  - group: Meta Llama
-    tag: llama
-    models:
-    - model: Llama 4 Scout 17B-16E
-      mad_tag: pyt_train_llama-4-scout-17b-16e
-      model_repo: Llama-4-17B_16E
-      url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.3 70B
-      mad_tag: pyt_train_llama-3.3-70b
-      model_repo: Llama-3.3-70B
-      url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-    - model: Llama 3.2 1B
-      mad_tag: pyt_train_llama-3.2-1b
-      model_repo: Llama-3.2-1B
-      url: https://huggingface.co/meta-llama/Llama-3.2-1B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.2 3B
-      mad_tag: pyt_train_llama-3.2-3b
-      model_repo: Llama-3.2-3B
-      url: https://huggingface.co/meta-llama/Llama-3.2-3B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3.2 Vision 11B
-      mad_tag: pyt_train_llama-3.2-vision-11b
-      model_repo: Llama-3.2-Vision-11B
-      url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
-      precision: BF16
-      training_modes: [finetune_fw]
-    - model: Llama 3.2 Vision 90B
-      mad_tag: pyt_train_llama-3.2-vision-90b
-      model_repo: Llama-3.2-Vision-90B
-      url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
-      precision: BF16
-      training_modes: [finetune_fw]
-    - model: Llama 3.1 8B
-      mad_tag: pyt_train_llama-3.1-8b
-      model_repo: Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-      training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
-    - model: Llama 3.1 70B
-      mad_tag: pyt_train_llama-3.1-70b
-      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-      precision: BF16
-      training_modes: [pretrain, finetune_fw, finetune_lora]
-    - model: Llama 3.1 405B
-      mad_tag: pyt_train_llama-3.1-405b
-      model_repo: Llama-3.1-405B
-      url: https://huggingface.co/meta-llama/Llama-3.1-405B
-      precision: BF16
-      training_modes: [finetune_qlora]
-    - model: Llama 3 8B
-      mad_tag: pyt_train_llama-3-8b
-      model_repo: Llama-3-8B
-      url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 3 70B
-      mad_tag: pyt_train_llama-3-70b
-      model_repo: Llama-3-70B
-      url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 2 7B
-      mad_tag: pyt_train_llama-2-7b
-      model_repo: Llama-2-7B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
-    - model: Llama 2 13B
-      mad_tag: pyt_train_llama-2-13b
-      model_repo: Llama-2-13B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Llama 2 70B
-      mad_tag: pyt_train_llama-2-70b
-      model_repo: Llama-2-70B
-      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
-      precision: BF16
-      training_modes: [finetune_lora, finetune_qlora]
-  - group: OpenAI
-    tag: openai
-    models:
-    - model: GPT OSS 20B
-      mad_tag: pyt_train_gpt_oss_20b
-      model_repo: GPT-OSS-20B
-      url: https://huggingface.co/openai/gpt-oss-20b
-      precision: BF16
-      training_modes: [HF_finetune_lora]
-    - model: GPT OSS 120B
-      mad_tag: pyt_train_gpt_oss_120b
-      model_repo: GPT-OSS-120B
-      url: https://huggingface.co/openai/gpt-oss-120b
-      precision: BF16
-      training_modes: [HF_finetune_lora]
-  - group: Qwen
-    tag: qwen
-    models:
-    - model: Qwen 3 8B
-      mad_tag: pyt_train_qwen3-8b
-      model_repo: Qwen3-8B
-      url: https://huggingface.co/Qwen/Qwen3-8B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Qwen 3 32B
-      mad_tag: pyt_train_qwen3-32b
-      model_repo: Qwen3-32
-      url: https://huggingface.co/Qwen/Qwen3-32B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2.5 32B
-      mad_tag: pyt_train_qwen2.5-32b
-      model_repo: Qwen2.5-32B
-      url: https://huggingface.co/Qwen/Qwen2.5-32B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2.5 72B
-      mad_tag: pyt_train_qwen2.5-72b
-      model_repo: Qwen2.5-72B
-      url: https://huggingface.co/Qwen/Qwen2.5-72B
-      precision: BF16
-      training_modes: [finetune_lora]
-    - model: Qwen 2 1.5B
-      mad_tag: pyt_train_qwen2-1.5b
-      model_repo: Qwen2-1.5B
-      url: https://huggingface.co/Qwen/Qwen2-1.5B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-    - model: Qwen 2 7B
-      mad_tag: pyt_train_qwen2-7b
-      model_repo: Qwen2-7B
-      url: https://huggingface.co/Qwen/Qwen2-7B
-      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
-  - group: Stable Diffusion
-    tag: sd
-    models:
-    - model: Stable Diffusion XL
-      mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
-      model_repo: SDXL
-      url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
-      precision: BF16
-      training_modes: [finetune_lora]
-  - group: Flux
-    tag: flux
-    models:
-    - model: FLUX.1-dev
-      mad_tag: pyt_train_flux
-      model_repo: Flux
-      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-      precision: BF16
-      training_modes: [pretrain]
-  - group: NCF
-    tag: ncf
-    models:
-    - model: NCF
-      mad_tag: pyt_ncf_training
-      model_repo:
-      url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
-      precision: FP32
--- a/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
@@ -1,22 +1,15 @@
 dockers:
-  MI355X and MI350X:
-    pull_tag: rocm/primus:v25.9_gfx950
-    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
-    components: &docker_components
-      ROCm: 7.0.0
-      Primus: 0.3.0
-      Primus Turbo: 0.1.1
-      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
+    components:
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      Flash Attention: 2.8.3
-      hipBLASLt: 911283acd1
-      Triton: 3.4.0+rocm7.0.0.git56765e8c
-      RCCL: 2.26.6
-  MI325X and MI300X:
-    pull_tag: rocm/primus:v25.9_gfx942
-    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
-    components: *docker_components
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
+      Triton: 3.3.0
+      RCCL: 2.22.3
 model_groups:
  - group: Meta Llama
    tag: llama
--- a/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/primus-pytorch-benchmark-models.yaml
@@ -1,39 +1,24 @@
 dockers:
-  MI355X and MI350X:
-    pull_tag: rocm/primus:v25.9_gfx950
-    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
-    components: &docker_components
-      ROCm: 7.0.0
-      Primus: 0.3.0
-      Primus Turbo: 0.1.1
-      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
-      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      Flash Attention: 2.8.3
-      hipBLASLt: 911283acd1
-      Triton: 3.4.0+rocm7.0.0.git56765e8c
-      RCCL: 2.26.6
-  MI325X and MI300X:
-    pull_tag: rocm/primus:v25.9_gfx942
-    docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
-    components: *docker_components
+  - pull_tag: rocm/pytorch-training:v25.8
+    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
+    components:
+      ROCm: 6.4.3
+      PyTorch: 2.8.0a0+gitd06a406
+      Python: 3.10.18
+      Transformer Engine: 2.2.0.dev0+a1e66aae
+      Flash Attention: 3.0.0.post1
+      hipBLASLt: 1.1.0-d1b517fc7a
 model_groups:
  - group: Meta Llama
    tag: llama
    models:
    - model: Llama 3.1 8B
      mad_tag: primus_pyt_train_llama-3.1-8b
-      model_repo: meta-llama/Llama-3.1-8B
+      model_repo: Llama-3.1-8B
      url: https://huggingface.co/meta-llama/Llama-3.1-8B
      precision: BF16
-      config_file:
-        bf16: "./llama3_8b_fsdp_bf16.toml"
-        fp8: "./llama3_8b_fsdp_fp8.toml"
    - model: Llama 3.1 70B
      mad_tag: primus_pyt_train_llama-3.1-70b
-      model_repo: meta-llama/Llama-3.1-70B
+      model_repo: Llama-3.1-70B
      url: https://huggingface.co/meta-llama/Llama-3.1-70B
      precision: BF16
-      config_file:
-        bf16: "./llama3_70b_fsdp_bf16.toml"
-        fp8: "./llama3_70b_fsdp_fp8.toml"
--- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
@@ -1,21 +1,13 @@
 dockers:
-  MI355X and MI350X:
-    pull_tag: rocm/pytorch-training:v25.9_gfx950
-    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
-    components: &docker_components
-      ROCm: 7.0.0
-      Primus: aab4234
-      PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
-      Python: "3.10"
-      Transformer Engine: 2.2.0.dev0+54dd2bdc
-      Flash Attention: 2.8.3
-      hipBLASLt: 911283acd1
-      Triton: 3.4.0+rocm7.0.0.git56765e8c
-      RCCL: 2.26.6
-  MI325X and MI300X:
-    pull_tag: rocm/pytorch-training:v25.9_gfx942
-    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
-    components: *docker_components
+  - pull_tag: rocm/pytorch-training:v25.8
+    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.8/images/sha256-5082ae01d73fec6972b0d84e5dad78c0926820dcf3c19f301d6c8eb892e573c5
+    components:
+      ROCm: 6.4.3
+      PyTorch: 2.8.0a0+gitd06a406
+      Python: 3.10.18
+      Transformer Engine: 2.2.0.dev0+a1e66aae
+      Flash Attention: 3.0.0.post1
+      hipBLASLt: 1.1.0-d1b517fc7a
 model_groups:
  - group: Meta Llama
    tag: llama
@@ -158,15 +150,6 @@ model_groups:
      url: https://huggingface.co/Qwen/Qwen2-7B
      precision: BF16
      training_modes: [finetune_fw, finetune_lora]
-  - group: Stable Diffusion
-    tag: sd
-    models:
-    - model: Stable Diffusion XL
-      mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
-      model_repo: SDXL
-      url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
-      precision: BF16
-      training_modes: [posttrain-p]
  - group: Flux
    tag: flux
    models:
@@ -175,7 +158,7 @@ model_groups:
      model_repo: Flux
      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
      precision: BF16
-      training_modes: [posttrain-p]
+      training_modes: [pretrain]
  - group: NCF
    tag: ncf
    models:
--- a/docs/data/reference/gpu-atomics-operation/cas-atomics_nopcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/cas-atomics_nopcie_instinct.csv
@@ -1,4 +1,4 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X Series,MI300A,MI350X Series
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X series,MI300A,MI350X series
 32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
 32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
 32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
--- a/docs/data/reference/gpu-atomics-operation/cas-atomics_pcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/cas-atomics_pcie_instinct.csv
@@ -1,4 +1,4 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X Series,MI300A,MI350X Series
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X series,MI300A,MI350X series
 32 bit atomicAdd,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
 32 bit atomicSub,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
 32 bit atomicMin,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS,✅ CAS
--- a/docs/data/reference/gpu-atomics-operation/hw-atomics_nopcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/hw-atomics_nopcie_instinct.csv
@@ -1,4 +1,4 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X Series,MI300A,MI350X Series
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X series,MI300A,MI350X series
 32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
 32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
 32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
--- a/docs/data/reference/gpu-atomics-operation/hw-atomics_pcie_instinct.csv
+++ b/docs/data/reference/gpu-atomics-operation/hw-atomics_pcie_instinct.csv
@@ -1,4 +1,4 @@
-Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X Series,MI300A,MI350X Series
+Atomic,MI100,MI200 PCIe,MI200 A+A,MI300X series,MI300A,MI350X series
 32 bit atomicAdd,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
 32 bit atomicSub,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
 32 bit atomicMin,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native,✅ Native
--- a/docs/how-to/deep-learning-rocm.rst
+++ b/docs/how-to/deep-learning-rocm.rst
@@ -10,7 +10,7 @@ Deep learning frameworks provide environments for machine learning, training, fi

 ROCm offers a complete ecosystem for developing and running deep learning applications efficiently. It also provides ROCm-compatible versions of popular frameworks and libraries, such as PyTorch, TensorFlow, JAX, and others.

-The AMD ROCm organization actively contributes to open-source development and collaborates closely with framework organizations. This collaboration ensures that framework-specific optimizations effectively leverage AMD GPUs.
+The AMD ROCm organization actively contributes to open-source development and collaborates closely with framework organizations. This collaboration ensures that framework-specific optimizations effectively leverage AMD GPUs and accelerators.

 The table below summarizes information about ROCm-enabled deep learning frameworks. It includes details on ROCm compatibility and third-party tool support, installation steps and options, and links to GitHub resources. For a complete list of supported framework versions on ROCm, see the :doc:`Compatibility matrix <../compatibility/compatibility-matrix>` topic.

@@ -128,22 +128,10 @@ The table below summarizes information about ROCm-enabled deep learning framewor
          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html"><i class="fas fa-link fa-lg"></i></a>
      - 
        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#use-a-prebuilt-docker-image-with-llama-cpp-pre-installed>`__
-        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#build-your-own-docker-image>`__
      - .. raw:: html

          <a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>

-    * - `FlashInfer <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/flashinfer-compatibility.html>`__
-      - .. raw:: html
-
-          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/flashinfer-install.html"><i class="fas fa-link fa-lg"></i></a>
-      - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/flashinfer-install.html#use-a-prebuilt-docker-image-with-flashinfer-pre-installed>`__
-        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/flashinfer-install.html#build-your-own-docker-image>`__
-      - .. raw:: html
-
-          <a href="https://github.com/ROCm/flashinfer"><i class="fab fa-github fa-lg"></i></a>
-
 Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
 through the following guides.

--- a/docs/how-to/gpu-performance/mi300x.rst
+++ b/docs/how-to/gpu-performance/mi300x.rst
@@ -1,5 +1,5 @@
 .. meta::
-   :description: How to configure MI300X GPUs to fully leverage their capabilities and achieve optimal performance.
+   :description: How to configure MI300X accelerators to fully leverage their capabilities and achieve optimal performance.
   :keywords: ROCm, AI, machine learning, MI300X, LLM, usage, tutorial, optimization, tuning

 **************************************
@@ -7,11 +7,11 @@ AMD Instinct MI300X performance guides
 **************************************

 The following performance guides provide essential guidance on the necessary
-steps to properly `configure your system for AMD Instinct™ MI300X GPUs
+steps to properly `configure your system for AMD Instinct™ MI300X accelerators
 <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
 They include detailed instructions on system settings and application
 :doc:`workload tuning </how-to/rocm-for-ai/inference-optimization/workload>` to
-help you leverage the maximum capabilities of these GPUs and achieve
+help you leverage the maximum capabilities of these accelerators and achieve
 superior performance.

 * `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__
@@ -19,9 +19,9 @@ superior performance.
  your AMD Instinct MI300X system for performance.

 * :doc:`/how-to/rocm-for-ai/inference-optimization/workload` covers steps to
-  optimize the performance of AMD Instinct MI300X Series GPUs for HPC
+  optimize the performance of AMD Instinct MI300X series accelerators for HPC
  and deep learning operations.

 * :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm` introduces a preconfigured
  environment for LLM inference, designed to help you test performance with
-  popular models on AMD Instinct MI300X Series GPUs.
+  popular models on AMD Instinct MI300X series accelerators.
--- a/docs/how-to/programming_guide.rst
+++ b/docs/how-to/programming_guide.rst
@@ -25,7 +25,7 @@ execute on AMD GPUs while maintaining compatibility with CUDA-based systems.
 OpenCL (Open Computing Language) is an open standard for cross-platform,
 parallel programming of diverse processors. ROCm supports OpenCL for developers
 who want to use standard frameworks across different hardware platforms,
-including CPUs, GPUs, and APUs. For more information, see
+including CPUs, GPUs, and other accelerators. For more information, see
 `OpenCL <https://www.khronos.org/opencl/>`_.

 Python bindings can be found at https://github.com/ROCm/hip-python.
--- a/docs/how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference.rst
+++ b/docs/how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference.rst
@@ -11,10 +11,10 @@ Fine-tuning using ROCm involves leveraging AMD's GPU-accelerated :doc:`libraries
 ecosystem for deep learning development, including open-source libraries for optimized deep learning operations and
 ROCm-aware versions of :doc:`deep learning frameworks <../../deep-learning-rocm>` such as PyTorch, TensorFlow, and JAX.

-Single-accelerator systems, such as a machine equipped with a single GPU, are commonly used for
+Single-accelerator systems, such as a machine equipped with a single accelerator or GPU, are commonly used for
 smaller-scale deep learning tasks, including fine-tuning pre-trained models and running inference on moderately
 sized datasets. See :doc:`single-gpu-fine-tuning-and-inference`.

-Multi-accelerator systems, on the other hand, consist of multiple GPUs working in parallel. These systems are
+Multi-accelerator systems, on the other hand, consist of multiple accelerators working in parallel. These systems are
 typically used in LLMs and other large-scale deep learning tasks where performance, scalability, and the handling of
 massive datasets are crucial. See :doc:`multi-gpu-fine-tuning-and-inference`.
--- a/docs/how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference.rst
+++ b/docs/how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference.rst
@@ -3,11 +3,11 @@
   :keywords: ROCm, LLM, fine-tuning, usage, tutorial, multi-GPU, distributed, inference, accelerators, PyTorch, HuggingFace, torchtune

 *****************************************************
-Fine-tuning and inference using multiple GPUs
+Fine-tuning and inference using multiple accelerators
 *****************************************************

 This section explains how to fine-tune a model on a multi-accelerator system. See
-:doc:`Single-accelerator fine-tuning <single-gpu-fine-tuning-and-inference>` for a single GPU setup.
+:doc:`Single-accelerator fine-tuning <single-gpu-fine-tuning-and-inference>` for a single accelerator or GPU setup.

 .. _fine-tuning-llms-multi-gpu-env:

@@ -20,7 +20,7 @@ This section was tested using the following hardware and software environment.
   :stub-columns: 1

   * - Hardware
-     - 4 AMD Instinct MI300X GPUs
+     - 4 AMD Instinct MI300X accelerators

   * - Software
     - ROCm 6.1, Ubuntu 22.04, PyTorch 2.1.2, Python 3.10
@@ -40,13 +40,13 @@ Setting up the base implementation environment
   :doc:`PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`. For consistent
   installation, it’s recommended to use official ROCm prebuilt Docker images with the framework pre-installed.

-#. In the Docker container, check the availability of ROCm-capable GPUs using the following command.
+#. In the Docker container, check the availability of ROCm-capable accelerators using the following command.

   .. code-block:: shell

      rocm-smi --showproductname

-#. Check that your GPUs are available to PyTorch.
+#. Check that your accelerators are available to PyTorch.

   .. code-block:: python

@@ -66,7 +66,7 @@ Setting up the base implementation environment
 .. tip::

   During training and inference, you can check the memory usage by running the ``rocm-smi`` command in your terminal.
-   This tool helps you see shows which GPUs are involved.
+   This tool shows which accelerators or GPUs are involved.


 .. _fine-tuning-llms-multi-gpu-hugging-face-accelerate:
@@ -74,9 +74,9 @@ Setting up the base implementation environment
 Hugging Face Accelerate for fine-tuning and inference
 ===========================================================

-`Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`__ is a library that simplifies turning raw
-PyTorch code for a single GPU into code for multiple GPUs for LLM fine-tuning and inference. It is
-integrated with `Transformers <https://huggingface.co/docs/transformers/en/index>`__, so you can scale your PyTorch
+`Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_ is a library that simplifies turning raw
+PyTorch code for a single accelerator into code for multiple accelerators for LLM fine-tuning and inference. It is
+integrated with `Transformers <https://huggingface.co/docs/transformers/en/index>`_, allowing you to scale your PyTorch
 code while maintaining performance and flexibility.

 As a brief example of model fine-tuning and inference using multiple GPUs, let's use Transformers and load in the Llama
@@ -107,7 +107,7 @@ Now, it's important to adjust how you load the model. Add the ``device_map`` par
   (``"auto"``, ``"balanced"``, ``"balanced_low_0"``, ``"sequential"``).

   It's recommended to set the ``device_map`` parameter to ``“auto”`` to allow Accelerate to automatically and
-   efficiently allocate the model given the available resources (four GPUs in this case).
+   efficiently allocate the model given the available resources (4 accelerators in this case).

   When you have more GPU memory available than the model size, here is the difference between each ``device_map``
   option:
@@ -130,8 +130,8 @@ After loading the model in this way, the model is fully ready to use the resourc
 torchtune for fine-tuning and inference
 =============================================

-`torchtune <https://pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single and multi-GPU 
-model fine-tuning and inference with LLMs.
+`torchtune <https://pytorch.org/torchtune/main/>`_ is a PyTorch-native library for easy single- and multi-accelerator
+(or GPU) model fine-tuning and inference with LLMs.

 #. Install torchtune using pip.

--- a/docs/how-to/rocm-for-ai/fine-tuning/overview.rst
+++ b/docs/how-to/rocm-for-ai/fine-tuning/overview.rst
@@ -30,7 +30,7 @@ The challenge of fine-tuning models

 However, the computational cost of fine-tuning is still high, especially for complex models and large datasets, which
 poses distinct challenges related to substantial computational and memory requirements. This might be a barrier for
-GPUs with low computing power or limited device memory resources.
+accelerators or GPUs with low computing power or limited device memory resources.

 For example, suppose we have a language model with 7 billion (7B) parameters, represented by a weight matrix :math:`W`.
 During backpropagation, the model needs to learn a :math:`ΔW` matrix, which updates the original weights to minimize the
@@ -84,8 +84,8 @@ Walkthrough
 ===========

 To demonstrate the benefits of LoRA and the ideal compute compatibility of using PEFT and TRL libraries on AMD
-ROCm-compatible GPUs, let's step through a comprehensive implementation of the fine-tuning process
-using the Llama 2 7B model with LoRA tailored specifically for question-and-answer tasks on AMD MI300X GPUs.
+ROCm-compatible accelerators and GPUs, let's step through a comprehensive implementation of the fine-tuning process
+using the Llama 2 7B model with LoRA tailored specifically for question-and-answer tasks on AMD MI300X accelerators.

 Before starting, review and understand the key components of this walkthrough:

--- a/docs/how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference.rst
+++ b/docs/how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference.rst
@@ -3,11 +3,12 @@
   :keywords: ROCm, LLM, fine-tuning, usage, tutorial, single-GPU, LoRA, PEFT, inference, SFTTrainer

 ****************************************************
-Fine-tuning and inference using a single GPU
+Fine-tuning and inference using a single accelerator
 ****************************************************

 This section explains model fine-tuning and inference techniques on a single-accelerator system. See
-:doc:`Multi-accelerator fine-tuning <multi-gpu-fine-tuning-and-inference>` for a setup with multiple GPUs.
+:doc:`Multi-accelerator fine-tuning <multi-gpu-fine-tuning-and-inference>` for a setup with multiple accelerators or
+GPUs.

 .. _fine-tuning-llms-single-gpu-env:

@@ -20,7 +21,7 @@ This section was tested using the following hardware and software environment.
   :stub-columns: 1

   * - Hardware
-     - AMD Instinct MI300X GPU
+     - AMD Instinct MI300X accelerator

   * - Software
     - ROCm 6.1, Ubuntu 22.04, PyTorch 2.1.2, Python 3.10
@@ -40,7 +41,7 @@ Setting up the base implementation environment
   :doc:`PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`. For a consistent
   installation, it’s recommended to use official ROCm prebuilt Docker images with the framework pre-installed.

-#. In the Docker container, check the availability of ROCm-capable GPUs using the following command.
+#. In the Docker container, check the availability of ROCm-capable accelerators using the following command.

   .. code-block:: shell

@@ -52,14 +53,14 @@ Setting up the base implementation environment

      ============================ ROCm System Management Interface ============================
      ====================================== Product Info ======================================
-      GPU[0]          : Card Series:          AMD Instinct MI300X OAM
+      GPU[0]          : Card series:          AMD Instinct MI300X OAM
      GPU[0]          : Card model:           0x74a1
      GPU[0]          : Card vendor:          Advanced Micro Devices, Inc. [AMD/ATI]
      GPU[0]          : Card SKU:             MI3SRIOV
      ==========================================================================================
      ================================== End of ROCm SMI Log ===================================

-#. Check that your GPUs are available to PyTorch.
+#. Check that your accelerators are available to PyTorch.

   .. code-block:: python

@@ -501,9 +502,9 @@ Let's look at achieving model inference using these types of models.
         # Token generation
         print(pipe("What is a large language model?")[0]["generated_text"])

-If using multiple GPUs, see
+If using multiple accelerators, see
 :ref:`Multi-accelerator fine-tuning and inference <fine-tuning-llms-multi-gpu-hugging-face-accelerate>` to explore
-popular libraries that simplify fine-tuning and inference in a multiple-GPU system.
+popular libraries that simplify fine-tuning and inference in a multi-accelerator system.

 Read more about inference frameworks like vLLM and Hugging Face TGI in
 :doc:`LLM inference frameworks <../inference/llm-inference-frameworks>`.
--- a/docs/how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries.rst
@@ -45,7 +45,7 @@ ROCm provides two different implementations of Flash Attention 2 modules. They c
         # Install from source
         git clone https://github.com/ROCm/flash-attention.git
         cd flash-attention/
-         GPU_ARCHS=gfx942 python setup.py install #MI300 Series
+         GPU_ARCHS=gfx942 python setup.py install #MI300 series

      Hugging Face Transformers can easily deploy the CK Flash Attention 2 module by passing an argument
      ``attn_implementation="flash_attention_2"`` to the ``from_pretrained`` method.
@@ -526,7 +526,7 @@ follow these instructions:
   python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning split_table_batched_embeddings_test.py

 To run the FBGEMM_GPU ``uvm`` test, use these commands. These tests only support the AMD MI210 and 
-more recent GPUs. 
+more recent accelerators. 

 .. code-block:: shell

--- a/docs/how-to/rocm-for-ai/inference-optimization/model-quantization.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/model-quantization.rst
@@ -7,7 +7,7 @@ Model quantization techniques
 *****************************

 Quantization reduces the model size compared to its native full-precision version, making it easier to fit large models
-onto GPUs with limited memory usage. This section explains how to perform LLM quantization using AMD Quark, GPTQ
+onto accelerators or GPUs with limited memory usage. This section explains how to perform LLM quantization using AMD Quark, GPTQ
 and bitsandbytes on AMD Instinct hardware.

 .. _quantize-llms-quark:
@@ -311,7 +311,7 @@ ExLlama-v2 support
 ExLlama is a Python/C++/CUDA implementation of the Llama model that is
 designed for faster inference with 4-bit GPTQ weights. The ExLlama
 kernel is activated by default when users create a ``GPTQConfig`` object. To
-boost inference speed even further on Instinct GPUs, use the ExLlama-v2
+boost inference speed even further on Instinct accelerators, use the ExLlama-v2
 kernels by configuring the ``exllama_config`` parameter as follows.

 .. code-block:: python
@@ -332,7 +332,7 @@ The `ROCm-aware bitsandbytes <https://github.com/ROCm/bitsandbytes>`_ library is
 a lightweight Python wrapper around CUDA custom functions, in particular 8-bit optimizer, matrix multiplication, and
 8-bit and 4-bit quantization functions. The library includes quantization primitives for 8-bit and 4-bit operations
 through ``bitsandbytes.nn.Linear8bitLt`` and ``bitsandbytes.nn.Linear4bit`` and 8-bit optimizers through the
-``bitsandbytes.optim`` module. These modules are supported on AMD Instinct GPUs.
+``bitsandbytes.optim`` module. These modules are supported on AMD Instinct accelerators.

 Installing bitsandbytes
 -----------------------
--- a/docs/how-to/rocm-for-ai/inference-optimization/optimizing-with-composable-kernel.md
+++ b/docs/how-to/rocm-for-ai/inference-optimization/optimizing-with-composable-kernel.md
@@ -9,13 +9,13 @@ myst:

 The AMD ROCm Composable Kernel (CK) library provides a programming model for writing performance-critical kernels for machine learning workloads. It generates a general-purpose kernel during the compilation phase through a C++ template, enabling developers to achieve operation fusions on different data precisions.

-This article gives a high-level overview of CK General Matrix Multiplication (GEMM) kernel based on the design example of `03_gemm_bias_relu`. It also outlines the steps to construct the kernel and run it. Moreover, the article provides a detailed implementation of running SmoothQuant quantized INT8 models on AMD Instinct MI300X GPUs using CK.
+This article gives a high-level overview of CK General Matrix Multiplication (GEMM) kernel based on the design example of `03_gemm_bias_relu`. It also outlines the steps to construct the kernel and run it. Moreover, the article provides a detailed implementation of running SmoothQuant quantized INT8 models on AMD Instinct MI300X accelerators using CK.

 ## High-level overview: a CK GEMM instance

 GEMM is a fundamental block in linear algebra, machine learning, and deep neural networks. It is defined as the operation:
 {math}`E = α \times (A \times B) + β \times (D)`, with A and B as matrix inputs, α and β as scalar inputs, and D as a pre-existing matrix.
-Take the commonly used linear transformation in a fully connected layer as an example. These terms correspond to input activation (A), weight (B), bias (D), and output (E), respectively. The example employs a `DeviceGemmMultipleD_Xdl_CShuffle` struct from CK library as the fundamental instance to explore the compute capability of AMD Instinct GPUs for the computation of GEMM. The implementation of the instance contains two phases:
+Take the commonly used linear transformation in a fully connected layer as an example. These terms correspond to input activation (A), weight (B), bias (D), and output (E), respectively. The example employs a `DeviceGemmMultipleD_Xdl_CShuffle` struct from CK library as the fundamental instance to explore the compute capability of AMD Instinct accelerators for the computation of GEMM. The implementation of the instance contains two phases:

 - [Template parameter definition](#template-parameter-definition)
 - [Instantiating and running the templated kernel](#instantiating-and-running-the-templated-kernel)
@@ -108,7 +108,7 @@ These parameters include Block Size, M/N/K Per Block, M/N per XDL, AK1, BK1, etc

 - Block Size determines the number of threads in the thread block.
 - M/N/K Per Block determines the size of tile that each thread block is responsible for calculating.
-- M/N Per XDL refers to M/N size for Instinct GPU Matrix Fused Multiply Add (MFMA) instructions operating on a per-wavefront basis.
+- M/N Per XDL refers to M/N size for Instinct accelerator Matrix Fused Multiply Add (MFMA) instructions operating on a per-wavefront basis.
 - A/B K1 is related to the data type. It can be any value ranging from 1 to K Per Block. To achieve the optimal load/store performance, 128bit per load is suggested. In addition, the A/B loading parameters must be changed accordingly to match the A/B K1 value; otherwise, it will result in compilation errors.

 Conditions for achieving computational load balancing on different hardware platforms can vary.
@@ -133,7 +133,7 @@ Templated kernel launching consists of kernel instantiation, making arguments by

 ## Developing fused INT8 kernels for SmoothQuant models

-[SmoothQuant](https://github.com/mit-han-lab/smoothquant) (SQ) is a quantization algorithm that enables an INT8 quantization of both weights and activations for all the matrix multiplications in LLM. The required GPU kernel functionalities used to accelerate the inference of SQ models on Instinct GPUs are shown in the following table.
+[SmoothQuant](https://github.com/mit-han-lab/smoothquant) (SQ) is a quantization algorithm that enables an INT8 quantization of both weights and activations for all the matrix multiplications in LLM. The required GPU kernel functionalities used to accelerate the inference of SQ models on Instinct accelerators are shown in the following table.

 :::{table} Functionalities used to implement SmoothQuant model inference.

@@ -164,7 +164,7 @@ The CK library contains many fundamental instances that implement different func

 Second, consider whether the format of input data meets your actual calculation needs. For SQ models, the 8-bit integer data format (INT8) is applied for matrix calculations.

-Third, consider the platform for implementing CK instances. The instances suffixed with `xdl` only run on AMD Instinct GPUs after being compiled and cannot run on Radeon-Series GPUs. This is due to the underlying device-specific instruction sets for implementing these basic instances.
+Third, consider the platform for implementing CK instances. The instances suffixed with `xdl` only run on AMD Instinct accelerators after being compiled and cannot run on Radeon-series GPUs. This is due to the underlying device-specific instruction sets for implementing these basic instances.

 Here, we use [DeviceBatchedGemmMultiD_Xdl](https://github.com/ROCm/composable_kernel/tree/develop/example/24_batched_gemm) as the fundamental instance to implement the functionalities in the previous table.

@@ -435,7 +435,7 @@ The implementation architecture of running SmoothQuant models on MI300X GPUs is
 ### Figure 7
 ================ -->
 ```{figure} ../../../data/how-to/llm-fine-tuning-optimization/ck-inference_flow.jpg
-The implementation architecture of running SmoothQuant models on AMD MI300X GPUs.
+The implementation architecture of running SmoothQuant models on AMD MI300X accelerators.
 ```

 For the target [SQ quantized model](https://huggingface.co/mit-han-lab/opt-13b-smoothquant), each decoder layer contains three major components: attention calculation, layer normalization, and linear transformation in fully connected layers.  The corresponding implementation classes for these components are:
@@ -447,21 +447,21 @@ For the target [SQ quantized model](https://huggingface.co/mit-han-lab/opt-13b-s
 These classes' underlying implementation logic will harness the functions in the previous table. Note that for the example, the `LayerNormQ` module is implemented by the torch native module.

 Testing environment:
-The hardware platform used for testing equips with 256 AMD EPYC 9534 64-Core Processor, 8 AMD Instinct MI300X GPUs and 1.5T memory. The testing was done in a publicly available Docker image from Docker Hub:
+The hardware platform used for testing is equipped with 256 AMD EPYC 9534 64-Core Processor, 8 AMD Instinct MI300X accelerators, and 1.5 TB of memory. The testing was done in a publicly available Docker image from Docker Hub:
 [`rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2`](https://hub.docker.com/layers/rocm/pytorch/rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2/images/sha256-f6ea7cee8aae299c7f6368187df7beed29928850c3929c81e6f24b34271d652b)

 The tested models are OPT-1.3B, 2.7B, 6.7B and 13B FP16 models and the corresponding SmoothQuant INT8 OPT models were obtained from Hugging Face.

 Note that since the default values were used for the tunable parameters of the fundamental instance, the performance of the INT8 kernel is suboptimal.

-Figure 8 shows the performance comparisons between the original FP16 and the SmoothQuant-quantized INT8 models on a single MI300X GPU. The GPU memory footprints of SmoothQuant-quantized models are significantly reduced. It also indicates the per-sample inference latency is significantly reduced for all SmoothQuant-quantized OPT models (illustrated in (b)). Notably, the performance of the CK instance-based INT8 kernel steadily improves with an increase in model size.
+Figure 8 shows the performance comparisons between the original FP16 and the SmoothQuant-quantized INT8 models on a single MI300X accelerator. The GPU memory footprints of SmoothQuant-quantized models are significantly reduced. It also indicates the per-sample inference latency is significantly reduced for all SmoothQuant-quantized OPT models (illustrated in (b)). Notably, the performance of the CK instance-based INT8 kernel steadily improves with an increase in model size.

 <!-- 
 ================
 ### Figure 8
 ================ -->
 ```{figure} ../../../data/how-to/llm-fine-tuning-optimization/ck-comparisons.jpg
-Performance comparisons between the original FP16 and the SmoothQuant-quantized INT8 models on a single MI300X GPU.
+Performance comparisons between the original FP16 and the SmoothQuant-quantized INT8 models on a single MI300X accelerator.
 ```

 For accuracy comparisons between the original FP16 and INT8 models, the evaluation is done by using the first 1,000 samples from the LAMBADA dataset's validation set. We employ the same Last Token Prediction Accuracy method introduced in [SmoothQuant Real-INT8 Inference for PyTorch](https://github.com/mit-han-lab/smoothquant/blob/main/examples/smoothquant_opt_real_int8_demo.ipynb) as our evaluation metric. The comparison results are shown in Table 2.
@@ -482,4 +482,4 @@ CK provides a rich set of template parameters for generating flexible accelerate

 CK supports multiple instruction sets of AMD Instinct GPUs, operator fusion and different data precisions. Its composability helps users quickly construct operator performance verification.

-With CK, you can build more effective AI applications with higher flexibility and better performance on different AMD GPU platforms.
+With CK, you can build more effective AI applications with higher flexibility and better performance on different AMD accelerator platforms.
--- a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
@@ -1,15 +1,15 @@
 .. meta::
-   :description: Learn about workload tuning on AMD Instinct MI300X GPUs for optimal performance.
+   :description: Learn about workload tuning on AMD Instinct MI300X accelerators for optimal performance.
   :keywords: AMD, Instinct, MI300X, HPC, tuning, BIOS settings, NBIO, ROCm,
              environment variable, performance, HIP, Triton, PyTorch TunableOp, vLLM, RCCL,
-              MIOpen, GPU, resource utilization
+              MIOpen, accelerator, GPU, resource utilization

 *****************************************
 AMD Instinct MI300X workload optimization
 *****************************************

 This document provides guidelines for optimizing the performance of AMD
-Instinct™ MI300X GPUs, with a particular focus on GPU kernel
+Instinct™ MI300X accelerators, with a particular focus on GPU kernel
 programming, high-performance computing (HPC), and deep learning operations
 using PyTorch. It delves into specific workloads such as
 :ref:`model inference <mi300x-vllm-optimization>`, offering strategies to
@@ -25,7 +25,7 @@ Workload tuning strategy

 By following a structured approach, you can systematically address
 performance issues and enhance the efficiency of your workloads on AMD Instinct
-MI300X GPUs.
+MI300X accelerators.

 Measure the current workload
 ----------------------------
@@ -86,7 +86,7 @@ Optimize model inference with vLLM
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 vLLM provides tools and techniques specifically designed for efficient model
-inference on AMD Instinct MI300X GPUs. See :ref:`fine-tuning-llms-vllm`
+inference on AMD Instinct MI300X accelerators. See :ref:`fine-tuning-llms-vllm`
 for installation guidance. Optimizing performance with vLLM
 involves configuring tensor parallelism, leveraging advanced features, and
 ensuring efficient execution. Here’s how to optimize vLLM performance:
@@ -239,7 +239,7 @@ benchmarking process.

 With AMD's profiling tools, developers are able to gain important insight into how efficiently their application is
 using hardware resources and effectively diagnose potential bottlenecks contributing to poor performance. Developers
-working with AMD Instinct GPUs have multiple tools depending on their specific profiling needs; these include:
+working with AMD Instinct accelerators have multiple tools depending on their specific profiling needs; these include:

 * :ref:`ROCProfiler <mi300x-rocprof>`

@@ -257,11 +257,11 @@ metrics, commonly called *performance counters*. These counters quantify the per
 showcasing which pieces of the computational pipeline and memory hierarchy are being utilized.

 Your ROCm installation contains a script or executable command called ``rocprof`` which provides the ability to list all
-available hardware counters for your specific GPU, and run applications while collecting counters during
+available hardware counters for your specific accelerator or GPU, and run applications while collecting counters during
 their execution.

 This ``rocprof`` utility also depends on the :doc:`ROCTracer and ROC-TX libraries <roctracer:index>`, giving it the
-ability to collect timeline traces of the GPU software stack as well as user-annotated code regions.
+ability to collect timeline traces of the accelerator software stack as well as user-annotated code regions.

 .. note::

@@ -276,16 +276,16 @@ ROCm Compute Profiler
 ^^^^^^^^^^^^^^^^^^^^^

 :doc:`ROCm Compute Profiler <rocprofiler-compute:index>` is a system performance profiler for high-performance computing (HPC) and
-machine learning (ML) workloads using Instinct GPUs. Under the hood, ROCm Compute Profiler uses
+machine learning (ML) workloads using Instinct accelerators. Under the hood, ROCm Compute Profiler uses
 :ref:`ROCProfiler <mi300x-rocprof>` to collect hardware performance counters. The ROCm Compute Profiler tool performs
 system profiling based on all approved hardware counters for Instinct
-GPU architectures. It provides high level performance analysis features including System Speed-of-Light, IP
+accelerator architectures. It provides high-level performance analysis features including System Speed-of-Light, IP
 block Speed-of-Light, Memory Chart Analysis, Roofline Analysis, Baseline Comparisons, and more.

 ROCm Compute Profiler takes the guesswork out of profiling by removing the need to provide text input files with lists of counters
 to collect and analyze raw CSV output files as is the case with ROCProfiler. Instead, ROCm Compute Profiler automates the collection
 of all available hardware counters in one command and provides graphical interfaces to help users understand and
-analyze bottlenecks and stressors for their computational workloads on AMD Instinct GPUs.
+analyze bottlenecks and stressors for their computational workloads on AMD Instinct accelerators.

 .. note::

@@ -411,7 +411,7 @@ for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
 usage with ROCm.

 ROCm provides a prebuilt optimized Docker image for validating the performance
-of LLM inference with vLLM on MI300X Series GPUs. The Docker image includes
+of LLM inference with vLLM on MI300X series accelerators. The Docker image includes
 ROCm, vLLM, and PyTorch. For more information, see
 :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.

@@ -449,7 +449,7 @@ Maximizing vLLM instances on a single node
 The general guideline is to maximize per-node throughput by running as many vLLM instances as possible.
 However, running too many instances might lead to insufficient memory for the KV-cache, which can affect performance.

-The Instinct MI300X GPU is equipped with 192 GB of HBM3 memory capacity and bandwidth.
+The Instinct MI300X accelerator is equipped with 192 GB of high-bandwidth HBM3 memory.
 For models that fit in one GPU -- to maximize the accumulated throughput -- you can run as many as eight vLLM instances
 simultaneously on one MI300X node (with eight GPUs). To do so, use the GPU isolation environment
 variable ``CUDA_VISIBLE_DEVICES``.
@@ -468,7 +468,7 @@ The total throughput achieved by running ``N`` instances of vLLM is generally mu
 single vLLM instance across ``N`` GPUs simultaneously (that is, configuring ``tensor_parallel_size`` as N or
 using the ``-tp`` N option, where ``1 < N ≤ 8``).

-vLLM on MI300X GPUs can run a variety of model weights, including Llama 2 (7b, 13b, 70b), Llama 3 (8b, 70b), Qwen2 (7b, 72b), Mixtral-8x7b, Mixtral-8x22b, and so on.
+vLLM on MI300X accelerators can run a variety of model weights, including Llama 2 (7b, 13b, 70b), Llama 3 (8b, 70b), Qwen2 (7b, 72b), Mixtral-8x7b, Mixtral-8x22b, and so on.
 Notable configurations include Llama2-70b and Llama3-70b models on a single MI300X GPU, and the Llama3.1 405b model can fit on one single node with 8 MI300X GPUs.

 .. _mi300x-vllm-gpu-memory-utilization:
@@ -917,7 +917,7 @@ ROCm library tuning involves optimizing the performance of routine computational
 operations (such as ``GEMM``) provided by ROCm libraries like
 :ref:`hipBLASLt <mi300x-hipblaslt>`, :ref:`Composable Kernel <mi300x-ck>`,
 :ref:`MIOpen <mi300x-miopen>`, and :ref:`RCCL <mi300x-rccl>`. This tuning aims
-to maximize efficiency and throughput on Instinct MI300X GPUs to gain 
+to maximize efficiency and throughput on Instinct MI300X accelerators to gain 
 improved application performance.

 .. _mi300x-library-gemm:
@@ -1451,7 +1451,7 @@ you can only use a fraction of the potential bandwidth on the node.
 The following figure shows an
 :doc:`MI300X node-level architecture </conceptual/gpu-arch/mi300>` of a
 system with AMD EPYC processors in a dual-socket configuration and eight
-AMD Instinct MI300X GPUs. The MI300X OAMs attach to the host system via
+AMD Instinct MI300X accelerators. The MI300X OAMs attach to the host system via
 PCIe Gen 5 x16 links (yellow lines). The GPUs use seven high-bandwidth,
 low-latency AMD Infinity Fabric™ links (red lines) to form a fully connected
 8-GPU system.
@@ -1460,7 +1460,7 @@ low-latency AMD Infinity Fabric™ links (red lines) to form a fully connected

 .. figure:: ../../../data/shared/mi300-node-level-arch.png

-   MI300 Series node-level architecture showing 8 fully interconnected MI300X
+   MI300 series node-level architecture showing 8 fully interconnected MI300X
   OAM modules connected to (optional) PCIe switches via re-timers and HGX
   connectors.

@@ -1653,7 +1653,7 @@ Auto-tunable kernel configuration involves adjusting memory access and computati
 resources assigned to each compute unit. It encompasses the usage of
 :ref:`LDS <mi300x-cu-fig>`, register, and task scheduling on a compute unit.

-The GPU contains global memory, local data share (LDS), and
+The accelerator or GPU contains global memory, local data share (LDS), and
 registers. Global memory has high access latency, but is large. LDS access has
 much lower latency, but is smaller. It is a fast on-CU software-managed memory
 that can be used to efficiently share data between all work items in a block.
@@ -1666,11 +1666,11 @@ Register access is the fastest yet smallest among the three.
   Schematic representation of a CU in the CDNA2 or CDNA3 architecture.

 The following is a list of kernel arguments used for tuning performance and
-resource allocation on AMD GPUs, which helps in optimizing the
+resource allocation on AMD accelerators, which helps in optimizing the
 efficiency and throughput of various computational kernels.

 ``num_stages=n``
-   Adjusts the number of pipeline stages for different types of kernels. On AMD GPUs, set ``num_stages``
+   Adjusts the number of pipeline stages for different types of kernels. On AMD accelerators, set ``num_stages``
   according to the following rules:

   * For kernels with a single GEMM, set to ``2``.
@@ -1697,15 +1697,15 @@ efficiency and throughput of various computational kernels.
   * The occupancy of the kernel is limited by VGPR usage, and

   * The current VGPR usage is only a few above a boundary in
-     :ref:`Occupancy related to VGPR usage in an Instinct MI300X GPU <mi300x-occupancy-vgpr-table>`.
+     :ref:`Occupancy related to VGPR usage in an Instinct MI300X accelerator <mi300x-occupancy-vgpr-table>`.

 .. _mi300x-occupancy-vgpr-table:

 .. figure:: ../../../data/shared/occupancy-vgpr.png
-   :alt: Occupancy related to VGPR usage in an Instinct MI300X GPU.
+   :alt: Occupancy related to VGPR usage in an Instinct MI300X accelerator.
   :align: center

-   Occupancy related to VGPRs usage on an Instinct MI300X GPU
+   Occupancy related to VGPR usage on an Instinct MI300X accelerator

 For example, according to the table, each Execution Unit (EU) has 512 available
 VGPRs, which are allocated in blocks of 16. If the current VGPR usage is 170,
@@ -1730,7 +1730,7 @@ VGPR usage so that it might fit 3 waves per EU.

   -  ``matrix_instr_nonkdim = 32``: ``mfma_32x32`` is used.

-   For GEMM kernels on an MI300X GPU, ``mfma_16x16`` typically outperforms ``mfma_32x32``, even for large
+   For GEMM kernels on an MI300X accelerator, ``mfma_16x16`` typically outperforms ``mfma_32x32``, even for large
   tile/GEMM sizes.


@@ -1749,7 +1749,7 @@ the number of CUs a kernel can distribute its task across.

   XCD-level system architecture showing 40 compute units,
   each with 32 KB L1 cache, a unified compute system with 4 ACE compute
-   GPUs, shared 4MB of L2 cache, and a hardware scheduler (HWS).
+   accelerators, shared 4MB of L2 cache, and a hardware scheduler (HWS).

 You can query hardware resources with the command ``rocminfo`` in the
 ``/opt/rocm/bin`` directory. For instance, query the number of CUs, number of
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -23,9 +23,9 @@ vLLM inference performance testing

   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X Series GPUs. This ROCm vLLM
-   Docker image integrates vLLM and PyTorch tailored specifically for MI300X Series
-   GPUs and includes the following components:
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:

   .. list-table::
      :header-rows: 1
@@ -47,7 +47,7 @@ vLLM inference performance testing

 With this Docker image, you can quickly test the :ref:`expected
 inference performance numbers <vllm-benchmark-performance-measurements-812>` for
-MI300X Series GPUs.
+MI300X series accelerators.

 What's new
 ==========
@@ -139,7 +139,7 @@ page provides reference throughput and serving measurements for inferencing popu
   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
   only reflects the latest version of this inference benchmarking environment.
-   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

 System validation
 =================
@@ -424,7 +424,7 @@ Further reading
 - To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

 - To learn more about system settings and management practices to configure your system for
-  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.1-20250909.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.1-20250909.rst
@@ -1,448 +0,0 @@
-:orphan:
-
-.. meta::
-   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the ROCm vLLM Docker image.
-   :keywords: model, MAD, automation, dashboarding, validate
-
-**********************************
-vLLM inference performance testing
-**********************************
-
-.. caution::
-
-   This documentation does not reflect the latest version of ROCm vLLM
-   inference performance documentation. See :doc:`../vllm` for the latest version.
-
-.. _vllm-benchmark-unified-docker-909:
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-
-   The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers
-   a prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X Series accelerators. This ROCm vLLM
-   Docker image integrates vLLM and PyTorch tailored specifically for MI300X Series
-   accelerators and includes the following components:
-
-   .. list-table::
-      :header-rows: 1
-
-      * - Software component
-        - Version
-
-      {% for component_name, component_version in docker.components.items() %}
-      * - {{ component_name }}
-        - {{ component_version }}
-      {% endfor %}
-
-With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements-909>` for
-MI300X Series accelerators.
-
-What's new
-==========
-
-The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
-
-* Upgraded to vLLM v0.10.1.
-
-* Set ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` by default for better performance.
-
-* Set ``VLLM_ROCM_USE_AITER_RMSNORM=0`` by default to avoid various issues with torch compile.
-
-.. _vllm-benchmark-supported-models-909:
-
-Supported models
-================
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-   {% set model_groups = data.model_groups %}
-
-   .. _vllm-benchmark-available-models-909:
-
-   The following models are supported for inference performance benchmarking
-   with vLLM and ROCm. Some instructions, commands, and recommendations in this
-   documentation might vary by model -- select one to get started.
-
-   .. raw:: html
-
-      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-         <div class="row gx-0">
-            <div class="col-2 me-1 px-2 model-param-head">Model</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-      {% endfor %}
-            </div>
-         </div>
-
-         <div class="row gx-0 pt-1">
-            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
-            <div class="row col-10 pe-0">
-      {% for model_group in model_groups %}
-         {% set models = model_group.models %}
-         {% for model in models %}
-            {% if models|length % 3 == 0 %}
-               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% else %}
-               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-            {% endif %}
-         {% endfor %}
-      {% endfor %}
-            </div>
-         </div>
-      </div>
-
-   .. _vllm-benchmark-vllm-909:
-
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{ model.mad_tag }}
-
-      .. note::
-
-         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
-         Some models require access authorization prior to use via an external license agreement through a third party.
-      {% if model.precision == "float8" and model.model_repo.startswith("amd") %}
-         This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD accelerators.
-      {% endif %}
-
-      {% endfor %}
-   {% endfor %}
-
-.. _vllm-benchmark-performance-measurements-909:
-
-Performance measurements
-========================
-
-To evaluate performance, the
-`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-page provides reference throughput and serving measurements for inferencing popular AI models.
-
-.. important::
-
-   The performance data presented in
-   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-   only reflects the latest version of this inference benchmarking environment.
-   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
-
-System validation
-=================
-
-Before running AI workloads, it's important to validate that your AMD hardware is configured
-correctly and performing optimally.
-
-If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
-before starting training.
-
-To test for optimal performance, consult the recommended :ref:`System health benchmarks
-<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-system's configuration.
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.1_20250909-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-   {% set model_groups = data.model_groups %}
-
-   Pull the Docker image
-   =====================
-
-   Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
-   Use the following command to pull the Docker image from Docker Hub.
-
-   .. code-block:: shell
-
-      docker pull {{ docker.pull_tag }}
-
-   Benchmarking
-   ============
-
-   Once the setup is complete, choose between two options to reproduce the
-   benchmark results:
-
-   .. _vllm-benchmark-mad-909:
-
-   {% for model_group in model_groups %}
-      {% for model in model_group.models %}
-
-   .. container:: model-doc {{model.mad_tag}}
-
-      .. tab-set::
-
-         .. tab-item:: MAD-integrated benchmarking
-
-            The following run command is tailored to {{ model.model }}.
-            See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
-
-            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
-               directory and install the required packages on the host machine.
-
-               .. code-block:: shell
-
-                  git clone https://github.com/ROCm/MAD
-                  cd MAD
-                  pip install -r requirements.txt
-
-            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
-               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
-
-               .. code-block:: shell
-
-                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-                  madengine run \
-                      --tags {{model.mad_tag}} \
-                      --keep-model-dir \
-                      --live-output \
-                      --timeout 28800
-
-            MAD launches a Docker container with the name
-            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
-            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
-            and ``{{ model.mad_tag }}_serving.csv``.
-
-            Although the :ref:`available models
-            <vllm-benchmark-available-models-909>` are preconfigured to collect
-            offline throughput and online serving performance data, you can
-            also change the benchmarking parameters. See the standalone
-            benchmarking tab for more information.
-
-            {% if model.tunableop %}
-
-            .. note::
-
-               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
-               TunableOp automatically explores different implementations and configurations of certain PyTorch
-               operators to find the fastest one for your hardware.
-
-               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
-               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
-               the ``--tunableop on`` argument in your run.
-
-               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
-               performance-collection run.
-
-            {% endif %}
-
-         .. tab-item:: Standalone benchmarking
-
-            The following commands are optimized for {{ model.model }}.
-            See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
-
-            .. seealso::
-
-               For more information on configuration, see the `config files
-               <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
-               in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
-               for descriptions of available configuration options
-               and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
-               additional benchmarking information.
-
-            .. rubric:: Launch the container
-
-            You can run the vLLM benchmark tool independently by starting the
-            `Docker container <{{ docker.docker_hub_url }}>`_ as shown
-            in the following snippet.
-
-            .. code-block:: shell
-
-               docker pull {{ docker.pull_tag }}
-               docker run -it \
-                   --device=/dev/kfd \
-                   --device=/dev/dri \
-                   --group-add video \
-                   --shm-size 16G \
-                   --security-opt seccomp=unconfined \
-                   --security-opt apparmor=unconfined \
-                   --cap-add=SYS_PTRACE \
-                   -v $(pwd):/workspace \
-                   --env HUGGINGFACE_HUB_CACHE=/workspace \
-                   --name test \
-                   {{ docker.pull_tag }}
-
-            .. rubric:: Throughput command
-
-            Use the following command to start the throughput benchmark.
-
-            .. code-block:: shell
-
-               model={{ model.model_repo }}
-               tp={{ model.config.tp }}
-               num_prompts=1024
-               in=128
-               out=128
-               dtype={{ model.config.dtype }}
-               kv_cache_dtype={{ model.config.kv_cache_dtype }}
-               max_num_seqs=1024
-               max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
-               max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
-               max_model_len={{ model.config.max_model_len }}
-
-               vllm bench throughput --model $model \
-                   -tp $tp \
-                   --num-prompts $num_prompts \
-                   --input-len $in \
-                   --output-len $out \
-                   --dtype $dtype \
-                   --kv-cache-dtype $kv_cache_dtype \
-                   --max-num-seqs $max_num_seqs \
-                   --max-seq-len-to-capture $max_seq_len_to_capture \
-                   --max-num-batched-tokens $max_num_batched_tokens \
-                   --max-model-len $max_model_len \
-                   --trust-remote-code \
-                   --output-json ${model}_throughput.json \
-                   --gpu-memory-utilization 0.9
-
-            .. rubric:: Serving command
-
-            1. Start the server using the following command:
-
-               .. code-block:: shell
-
-                  model={{ model.model_repo }}
-                  tp={{ model.config.tp }}
-                  dtype={{ model.config.dtype }}
-                  kv_cache_dtype={{ model.config.kv_cache_dtype }}
-                  max_num_seqs=256
-                  max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
-                  max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
-                  max_model_len={{ model.config.max_model_len }}
-
-                  vllm serve $model \
-                      -tp $tp \
-                      --dtype $dtype \
-                      --kv-cache-dtype $kv_cache_dtype \
-                      --max-num-seqs $max_num_seqs \
-                      --max-seq-len-to-capture $max_seq_len_to_capture \
-                      --max-num-batched-tokens $max_num_batched_tokens \
-                      --max-model-len $max_model_len \
-                      --no-enable-prefix-caching \
-                      --swap-space 16 \
-                      --disable-log-requests \
-                      --trust-remote-code \
-                      --gpu-memory-utilization 0.9
-
-               Wait until the model has loaded and the server is ready to accept requests.
-
-            2. On another terminal on the same machine, run the benchmark:
-
-               .. code-block:: shell
-
-                  # Connect to the container
-                  docker exec -it test bash
-
-                  # Wait for the server to start
-                  until curl -s http://localhost:8000/v1/models; do sleep 30; done
-
-                  # Run the benchmark
-                  model={{ model.model_repo }}
-                  max_concurrency=1
-                  num_prompts=10
-                  in=128
-                  out=128
-                  vllm bench serve --model $model \
-                      --percentile-metrics "ttft,tpot,itl,e2el" \
-                      --dataset-name random \
-                      --ignore-eos \
-                      --max-concurrency $max_concurrency \
-                      --num-prompts $num_prompts \
-                      --random-input-len $in \
-                      --random-output-len $out \
-                      --trust-remote-code \
-                      --save-result \
-                      --result-filename ${model}_serving.json
-
-            .. note::
-
-               For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B,
-               try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands.
-
-               If you encounter the following error, pass your access-authorized Hugging
-               Face token to the gated models.
-
-               .. code-block::
-
-                  OSError: You are trying to access a gated repo.
-
-                  # pass your HF_TOKEN
-                  export HF_TOKEN=$your_personal_hf_token
-
-            .. raw:: html
-
-               <style>
-               mjx-container[jax="CHTML"][display="true"] {
-                  text-align: left;
-                  margin: 0;
-               }
-               </style>
-
-            .. note::
-
-               Throughput is calculated as:
-
-               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
-
-               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
-      {% endfor %}
-   {% endfor %}
-
-Advanced usage
-==============
-
-For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
-see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
-
-Reproducing the Docker image
----------------------------
-
-To reproduce this ROCm/vLLM Docker image release, follow these steps:
-
-1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
-
-   .. code-block:: shell
-
-      git clone https://github.com/ROCm/vllm.git
-
-2. Checkout the specific release commit.
-
-   .. code-block:: shell
-
-      cd vllm
-      git checkout 6663000a391911eba96d7864a26ac42b07f6ef29
-
-3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
-
-   .. code-block:: shell
-
-      docker build -f docker/Dockerfile.rocm -t vllm-rocm .
-
-Further reading
-===============
-
- To learn more about the options for latency and throughput benchmark scripts,
-  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
-
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
-
- To learn more about system settings and management practices to configure your system for
-  AMD Instinct MI300X Series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
-
- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
-  a brief introduction to vLLM and optimization strategies.
-
- For application performance optimization strategies for HPC and AI workloads,
-  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
-
- For a list of other ready-made Docker images for AI with ROCm, see
-  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
-
-Previous versions
-=================
-
-See :doc:`vllm-history` to find documentation for previous releases
-of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the unified
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
                 ROCm Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -18,9 +18,9 @@ vLLM inference performance testing

 The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
 a prebuilt, optimized environment designed for validating large language model
-(LLM) inference performance on the AMD Instinct™ MI300X GPU. This
+(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
 ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
-MI300X GPU and includes the following components:
+MI300X accelerator and includes the following components:

 * `ROCm 6.2.0 <https://github.com/ROCm/ROCm>`_

@@ -31,7 +31,7 @@ MI300X GPU and includes the following components:
 * Tuning files (in CSV format)

 With this Docker image, you can quickly validate the expected inference
-performance numbers on the MI300X GPU. This topic also provides tips on
+performance numbers on the MI300X accelerator. This topic also provides tips on
 optimizing performance with popular AI models.

 .. _vllm-benchmark-vllm:
@@ -51,7 +51,7 @@ Getting started
 ===============

 Use the following procedures to reproduce the benchmark results on an
-MI300X GPU with the prebuilt vLLM Docker image.
+MI300X accelerator with the prebuilt vLLM Docker image.

 .. _vllm-benchmark-get-started:

@@ -267,7 +267,7 @@ Options

 .. _vllm-benchmark-run-benchmark-v043:

-Running the benchmark on the MI300X GPU
+Running the benchmark on the MI300X accelerator
 -----------------------------------------------

 Here are some examples of running the benchmark with various options.
@@ -328,7 +328,7 @@ Further reading
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-  MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - To learn how to run community models from Hugging Face on AMD GPUs, see
  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the unified
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
                 ROCm Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -18,9 +18,9 @@ vLLM inference performance testing

 The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
 a prebuilt, optimized environment designed for validating large language model
-(LLM) inference performance on the AMD Instinct™ MI300X GPU. This
+(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
 ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
-MI300X GPU and includes the following components:
+MI300X accelerator and includes the following components:

 * `ROCm 6.2.1 <https://github.com/ROCm/ROCm>`_

@@ -31,7 +31,7 @@ MI300X GPU and includes the following components:
 * Tuning files (in CSV format)

 With this Docker image, you can quickly validate the expected inference
-performance numbers on the MI300X GPU. This topic also provides tips on
+performance numbers on the MI300X accelerator. This topic also provides tips on
 optimizing performance with popular AI models.

 .. hlist::
@@ -74,7 +74,7 @@ Getting started
 ===============

 Use the following procedures to reproduce the benchmark results on an
-MI300X GPU with the prebuilt vLLM Docker image.
+MI300X accelerator with the prebuilt vLLM Docker image.

 .. _vllm-benchmark-get-started:

@@ -332,7 +332,7 @@ Options

 .. _vllm-benchmark-run-benchmark-v064:

-Running the benchmark on the MI300X GPU
+Running the benchmark on the MI300X accelerator
 -----------------------------------------------

 Here are some examples of running the benchmark with various options.
@@ -398,7 +398,7 @@ Further reading
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-  MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - To learn how to run community models from Hugging Face on AMD GPUs, see
  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -18,9 +18,9 @@ LLM inference performance validation on AMD Instinct MI300X

 The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
 a prebuilt, optimized environment for validating large language model (LLM)
-inference performance on the AMD Instinct™ MI300X GPU. This ROCm vLLM
+inference performance on the AMD Instinct™ MI300X accelerator. This ROCm vLLM
 Docker image integrates vLLM and PyTorch tailored specifically for the MI300X
-GPU and includes the following components:
+accelerator and includes the following components:

 * `ROCm 6.3.1 <https://github.com/ROCm/ROCm>`_

@@ -29,7 +29,7 @@ GPU and includes the following components:
 * `PyTorch 2.7.0 (2.7.0a0+git3a58512) <https://github.com/pytorch/pytorch>`_

 With this Docker image, you can quickly validate the expected inference
-performance numbers for the MI300X GPU. This topic also provides tips on
+performance numbers for the MI300X accelerator. This topic also provides tips on
 optimizing performance with popular AI models. For more information, see the lists of
 :ref:`available models for MAD-integrated benchmarking <vllm-benchmark-mad-v066-models>`
 and :ref:`standalone benchmarking <vllm-benchmark-standalone-v066-options>`.
@@ -47,7 +47,7 @@ Getting started
 ===============

 Use the following procedures to reproduce the benchmark results on an
-MI300X GPU with the prebuilt vLLM Docker image.
+MI300X accelerator with the prebuilt vLLM Docker image.

 .. _vllm-benchmark-get-started:

@@ -377,7 +377,7 @@ Options and available models

 .. _vllm-benchmark-run-benchmark-v066:

-Running the benchmark on the MI300X GPU
+Running the benchmark on the MI300X accelerator
 -----------------------------------------------

 Here are some examples of running the benchmark with various options.
@@ -443,7 +443,7 @@ Further reading
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-  MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - To learn how to run community models from Hugging Face on AMD GPUs, see
  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.7.3-20250325.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.7.3-20250325.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -23,9 +23,9 @@ vLLM inference performance testing

   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X Series GPU. This ROCm vLLM
-   Docker image integrates vLLM and PyTorch tailored specifically for MI300X Series
-   GPUs and includes the following components:
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:

   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_

@@ -37,7 +37,7 @@ vLLM inference performance testing

   With this Docker image, you can quickly test the :ref:`expected
   inference performance numbers <vllm-benchmark-performance-measurements-v073>` for
-   MI300X Series GPUs.
+   MI300X series accelerators.

   .. _vllm-benchmark-available-models-v073:

@@ -110,7 +110,7 @@ vLLM inference performance testing
      The performance data presented in
      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
      only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
-      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software.
+      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

   Advanced features and known issues
   ==================================
@@ -122,7 +122,7 @@ vLLM inference performance testing
   ===============

   Use the following procedures to reproduce the benchmark results on an
-   MI300X GPU with the prebuilt vLLM Docker image.
+   MI300X accelerator with the prebuilt vLLM Docker image.

   .. _vllm-benchmark-get-started:

@@ -311,7 +311,7 @@ Further reading
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-  MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - To learn how to run community models from Hugging Face on AMD GPUs, see
  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.3-20250415.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.3-20250415.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -18,9 +18,9 @@ vLLM inference performance testing

   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X Series GPUs. This ROCm vLLM
-   Docker image integrates vLLM and PyTorch tailored specifically for MI300X Series
-   GPUs and includes the following components:
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:

   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_

@@ -32,7 +32,7 @@ vLLM inference performance testing

   With this Docker image, you can quickly test the :ref:`expected
   inference performance numbers <vllm-benchmark-performance-measurements-v083>` for
-   MI300X Series GPUs.
+   MI300X series accelerators.

   .. _vllm-benchmark-available-models-v083:

@@ -105,7 +105,7 @@ vLLM inference performance testing
      The performance data presented in
      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
      only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
-      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software.
+      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

   Advanced features and known issues
   ==================================
@@ -327,7 +327,7 @@ Further reading
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-  MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - To learn how to run community models from Hugging Face on AMD GPUs, see
  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -23,9 +23,9 @@ vLLM inference performance testing

   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X Series GPUs. This ROCm vLLM
-   Docker image integrates vLLM and PyTorch tailored specifically for MI300X Series
-   GPUs and includes the following components:
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:

   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_

@@ -37,7 +37,7 @@ vLLM inference performance testing

   With this Docker image, you can quickly test the :ref:`expected
   inference performance numbers <vllm-benchmark-performance-measurements-v085-20250513>` for
-   MI300X Series GPUs.
+   MI300X series accelerators.

   .. _vllm-benchmark-available-models-v085-20250513:

@@ -114,7 +114,7 @@ vLLM inference performance testing
      The performance data presented in
      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
      only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
-      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software.
+      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

   Advanced features and known issues
   ==================================
@@ -333,7 +333,7 @@ Further reading
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-  MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -23,9 +23,9 @@ vLLM inference performance testing

   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X Series GPUs. This ROCm vLLM
-   Docker image integrates vLLM and PyTorch tailored specifically for MI300X Series
-   GPUs and includes the following components:
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:

   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_

@@ -37,7 +37,7 @@ vLLM inference performance testing

   With this Docker image, you can quickly test the :ref:`expected
   inference performance numbers <vllm-benchmark-performance-measurements-v085-20250521>` for
-   MI300X Series GPUs.
+   MI300X series accelerators.

   .. _vllm-benchmark-available-models-v085-20250521:

@@ -114,13 +114,13 @@ vLLM inference performance testing
      The performance data presented in
      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
      should not be interpreted as the peak performance achievable by AMD
-      Instinct MI325X and MI300X GPUs or ROCm software.
+      Instinct MI325X and MI300X accelerators or ROCm software.

   Advanced features and known issues
   ==================================

   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
-   see the developer's guide at `<https://github.com/ROCm/vllm/blob/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker/README.md>`__.
+   see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.

   System validation
   =================
@@ -333,7 +333,7 @@ Further reading
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-  MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -23,9 +23,9 @@ vLLM inference performance testing

   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X Series GPUs. This ROCm vLLM
-   Docker image integrates vLLM and PyTorch tailored specifically for MI300X Series
-   GPUs and includes the following components:
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:

   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_

@@ -37,7 +37,7 @@ vLLM inference performance testing

   With this Docker image, you can quickly test the :ref:`expected
   inference performance numbers <vllm-benchmark-performance-measurements-v0901-20250605>` for
-   MI300X Series GPUs.
+   MI300X series accelerators.

   .. _vllm-benchmark-available-models-v0901-20250605:

@@ -113,7 +113,7 @@ vLLM inference performance testing
      The performance data presented in
      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
      only reflects the latest version of this inference benchmarking environment.
-      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software.
+      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

   Advanced features and known issues
   ==================================
@@ -332,7 +332,7 @@ Further reading
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-  MI300X GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+  MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -23,9 +23,9 @@ vLLM inference performance testing

   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X Series GPUs. This ROCm vLLM
-   Docker image integrates vLLM and PyTorch tailored specifically for MI300X Series
-   GPUs and includes the following components:
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:

   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_

@@ -37,7 +37,7 @@ vLLM inference performance testing

   With this Docker image, you can quickly test the :ref:`expected
   inference performance numbers <vllm-benchmark-performance-measurements-20250702>` for
-   MI300X Series GPUs.
+   MI300X series accelerators.

   .. _vllm-benchmark-available-models-20250702:

@@ -113,7 +113,7 @@ vLLM inference performance testing
      The performance data presented in
      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
      only reflects the latest version of this inference benchmarking environment.
-      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software.
+      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

   Advanced features and known issues
   ==================================
@@ -332,7 +332,7 @@ Further reading
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

 - To learn more about system settings and management practices to configure your system for
-  MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
@@ -1,7 +1,7 @@
 :orphan:

 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

@@ -23,9 +23,9 @@ vLLM inference performance testing

   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI300X Series GPUs. This ROCm vLLM
-   Docker image integrates vLLM and PyTorch tailored specifically for MI300X Series
-   GPUs and includes the following components:
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:

   .. list-table::
      :header-rows: 1
@@ -47,7 +47,7 @@ vLLM inference performance testing

 With this Docker image, you can quickly test the :ref:`expected
 inference performance numbers <vllm-benchmark-performance-measurements-715>` for
-MI300X Series GPUs.
+MI300X series accelerators.

 What's new
 ==========
@@ -145,7 +145,7 @@ page provides reference throughput and latency measurements for inferencing popu
   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
   only reflects the latest version of this inference benchmarking environment.
-   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X GPUs or ROCm software.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

 System validation
 =================
@@ -429,7 +429,7 @@ Further reading
 - To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

 - To learn more about system settings and management practices to configure your system for
-  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
@@ -7,7 +7,7 @@ vLLM inference performance testing version history
 This table lists previous versions of the ROCm vLLM inference Docker image for
 inference performance testing. For detailed information about available models
 for benchmarking, see the version-specific documentation. You can find tagged
-previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`__.
+previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c>`__.

 .. list-table::
   :header-rows: 1
@@ -16,23 +16,14 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
     - Components
     - Resources

-   * - ``rocm/vllm:rocm7.0.0_vllm_0.10.2_20251006``
-       (latest)
-     -
-       * ROCm 7.0.0
-       * vLLM 0.10.2
-       * PyTorch 2.9.0
-     -
-       * :doc:`Documentation <../vllm>`
-       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm7.0.0_vllm_0.10.2_20251006/images/sha256-94fd001964e1cf55c3224a445b1fb5be31a7dac302315255db8422d813edd7f5>`__
-
   * - ``rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909``
+       (latest)
     -
       * ROCm 6.4.1
       * vLLM 0.10.1
       * PyTorch 2.7.0
     -
-       * :doc:`Documentation <vllm-0.10.1-20250909>`
+       * :doc:`Documentation <../vllm>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c>`__

   * - ``rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812``
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
@@ -1,5 +1,5 @@
 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm PyTorch Docker image.
   :keywords: model, MAD, automation, dashboarding, validate, pytorch

@@ -15,8 +15,8 @@ PyTorch inference performance testing
   {% set model_groups = data.pytorch_inference_benchmark.model_groups %}

   The `ROCm PyTorch Docker <https://hub.docker.com/r/rocm/pytorch/tags>`_ image offers a prebuilt,
-   optimized environment for testing model inference performance on AMD Instinct™ MI300X Series
-   GPUs. This guide demonstrates how to use the AMD Model Automation and Dashboarding (MAD)
+   optimized environment for testing model inference performance on AMD Instinct™ MI300X series
+   accelerators. This guide demonstrates how to use the AMD Model Automation and Dashboarding (MAD)
   tool with the ROCm PyTorch container to test inference performance on various models efficiently.

   .. _pytorch-inference-benchmark-available-models:
@@ -175,7 +175,7 @@ Further reading
 - To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

 - To learn more about system settings and management practices to configure your system for
-  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`../../inference-optimization/workload`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst
@@ -22,8 +22,8 @@ improved efficiency and throughput.
   `SGLang <https://docs.sglang.ai>`__ is a high-performance inference and
   serving engine for large language models (LLMs) and vision models. The
   ROCm-enabled `SGLang base Docker image <{{ docker.docker_hub_url }}>`__
-   bundles SGLang with PyTorch, which is optimized for AMD Instinct MI300X Series
-   GPUs. It includes the following software components:
+   bundles SGLang with PyTorch, which is optimized for AMD Instinct MI300X series
+   accelerators. It includes the following software components:

   .. list-table::
      :header-rows: 1
@@ -37,7 +37,7 @@ improved efficiency and throughput.
      {% endfor %}

 The following sections guide you through setting up and running SGLang and Mooncake for disaggregated
-distributed inference on a Slurm cluster using AMD Instinct MI300X Series GPUs backed by
+distributed inference on a Slurm cluster using AMD Instinct MI300X series accelerators backed by
 Mellanox CX-7 NICs.

 Prerequisites
@@ -111,7 +111,7 @@ Build the Docker image
 ----------------------

 Get the Dockerfile located in
-`<https://github.com/ROCm/MAD/blob/develop/docker/sglang_disagg_inference.ubuntu.amd.Dockerfile>`__.
+`<https://github.com/ROCm/MAD/blob/develop/docker/sglang_dissag_inference.ubuntu.amd.Dockerfile>`__.
 It uses `lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x
 <https://hub.docker.com/layers/lmsysorg/sglang/v0.5.2rc1-rocm700-mi30x/images/sha256-10c4ee502ddba44dd8c13325e6e03868bfe7f43d23d0a44780a8ee8b393f4729>`__
 as the base Docker image and installs the necessary components for Mooncake, etcd, and Mellanox network
@@ -122,26 +122,26 @@ drivers.
   git clone https://github.com/ROCm/MAD.git
   cd MAD/docker
   docker build \
-       -t sglang_disagg_pd_image \
-       -f sglang_disagg_inference.ubuntu.amd.Dockerfile .
+       -t sglang_dissag_pd_image \
+       -f sglang_dissag_inference.ubuntu.amd.Dockerfile .

 Benchmarking
 ============

-The `<https://github.com/ROCm/MAD/tree/develop/scripts/sglang_disagg>`__
+The `<https://github.com/ROCm/MAD/tree/develop/scripts/sglang_dissag>`__
 repository contains scripts to launch SGLang inference with prefill/decode
 disaggregation via Mooncake for supported models.

-* `scripts/sglang_dissag/run_xPyD_models.slurm <https://github.com/ROCm/MAD/blob/develop/scripts/sglang_disagg/run_xPyD_models.slurm>`__
+* `scripts/sglang_dissag/run_xPyD_models.slurm <https://github.com/ROCm/MAD/blob/develop/scripts/sglang_dissag/run_xPyD_models.slurm>`__
  -- the main Slurm batch script to launch Docker containers on all nodes using ``sbatch`` or ``salloc``.

-* `scripts/sglang_dissag/sglang_disagg_server.sh <https://github.com/ROCm/MAD/blob/develop/scripts/sglang_disagg/sglang_disagg_server.sh>`__
+* `scripts/sglang_dissag/sglang_disagg_server.sh <https://github.com/ROCm/MAD/blob/develop/scripts/sglang_dissag/sglang_disagg_server.sh>`__
  -- the entrypoint script that runs inside each container to start the correct service -- proxy, prefill, or decode.

-* `scripts/sglang_dissag/benchmark_xPyD.sh <https://github.com/ROCm/MAD/blob/develop/scripts/sglang_disagg/benchmark_xPyD.sh>`__
+* `scripts/sglang_dissag/benchmark_xPyD.sh <https://github.com/ROCm/MAD/blob/develop/scripts/sglang_dissag/benchmark_xPyD.sh>`__
  -- the benchmark script to run the GSM8K accuracy benchmark and the SGLang benchmarking tool for performance measurement.

-* `scripts/sglang_dissag/benchmark_parser.py <https://github.com/ROCm/MAD/blob/develop/scripts/sglang_disagg/benchmark_parser.py>`__
+* `scripts/sglang_dissag/benchmark_parser.py <https://github.com/ROCm/MAD/blob/develop/scripts/sglang_dissag/benchmark_parser.py>`__
  -- the log parser script to be run on the concurrency benchmark log file to generate tabulated data.

 Launch the service
@@ -163,10 +163,10 @@ allocated nodes.
         # Clone the MAD repo if you haven't already and
         # navigate to the scripts directory
         git clone https://github.com/ROCm/MAD.git
-         cd MAD/scripts/sglang_disagg/
+         cd MAD/scripts/sglang_dissag/

         # Slurm sbatch run command
-         export DOCKER_IMAGE_NAME=sglang_disagg_pd_image
+         export DOCKER_IMAGE_NAME=sglang_dissag_pd_image
         export xP=<num_prefill_nodes>
         export yD=<num_decode_nodes>
         export MODEL_NAME={{ model.model_repo }}
@@ -236,7 +236,7 @@ Further reading
 - See the base upstream Docker image on `Docker Hub <https://hub.docker.com/layers/lmsysorg/sglang/v0.5.2rc1-rocm700-mi30x/images/sha256-10c4ee502ddba44dd8c13325e6e03868bfe7f43d23d0a44780a8ee8b393f4729>`__.

 - To learn more about system settings and management practices to configure your system for
-  MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__.
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__.

 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
@@ -1,5 +1,5 @@
 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and SGLang
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and SGLang
   :keywords: model, MAD, automation, dashboarding, validate

 *****************************************************************
@@ -15,8 +15,8 @@ SGLang inference performance testing DeepSeek-R1-Distill-Qwen-32B
   `SGLang <https://docs.sglang.ai>`__ is a high-performance inference and
   serving engine for large language models (LLMs) and vision models. The
   ROCm-enabled `SGLang Docker image <{{ docker.docker_hub_url }}>`__
-   bundles SGLang with PyTorch, optimized for AMD Instinct MI300X Series
-   GPUs. It includes the following software components:
+   bundles SGLang with PyTorch, optimized for AMD Instinct MI300X series
+   accelerators. It includes the following software components:

   .. list-table::
      :header-rows: 1
@@ -255,7 +255,7 @@ Further reading
 - To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

 - To learn more about system settings and management practices to configure your system for
-  MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__.
+  MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__.

 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
@@ -1,68 +1,50 @@
 .. meta::
-   :description: Learn how to validate LLM inference performance on MI300X GPUs using AMD MAD and the ROCm vLLM Docker image.
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

 **********************************
 vLLM inference performance testing
 **********************************

-.. _vllm-benchmark-unified-docker-930:
+.. _vllm-benchmark-unified-docker-909:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml

   {% set docker = data.dockers[0] %}

-   The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers a
-   prebuilt, optimized environment for validating large language model (LLM)
-   inference performance on AMD Instinct™ MI355X, MI350X, MI325X and MI300X
-   GPUs. This ROCm vLLM Docker image integrates vLLM and PyTorch tailored
-   specifically for AMD data center GPUs and includes the following components:
+   The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers
+   a prebuilt, optimized environment for validating large language model (LLM)
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:

-   .. tab-set::
+   .. list-table::
+      :header-rows: 1

-      .. tab-item:: {{ docker.pull_tag }}
+      * - Software component
+        - Version

-         .. list-table::
-            :header-rows: 1
-
-            * - Software component
-              - Version
-
-            {% for component_name, component_version in docker.components.items() %}
-            * - {{ component_name }}
-              - {{ component_version }}
-            {% endfor %}
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}

 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements-930>` for
-AMD Instinct GPUs.
+inference performance numbers <vllm-benchmark-performance-measurements-909>` for
+MI300X series accelerators.

 What's new
 ==========

 The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.

-* Added support for AMD Instinct MI355X and MI350X GPUs.
+* Upgraded to vLLM v0.10.1.

-* Added support and benchmarking instructions for the following models. See :ref:`vllm-benchmark-supported-models-930`.
+* Set ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` by default for better performance.

-  * Llama 4 Scout and Maverick
+* Set ``VLLM_ROCM_USE_AITER_RMSNORM=0`` by default to avoid various issues with torch compile.

-  * DeepSeek R1 0528 FP8
-
-  * MXFP4 models (MI355X and MI350X only): Llama 3.3 70B MXFP4 and Llama 3.1 405B MXFP4
-
-  * GPT OSS 20B and 120B
-
-  * Qwen 3 32B, 30B-A3B, and 235B-A22B
-
-* Removed the deprecated ``--max-seq-len-to-capture`` flag.
-
-* ``--gpu-memory-utilization`` is now configurable via the `configuration files
-  <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__ in the MAD
-  repository.
-
-.. _vllm-benchmark-supported-models-930:
+.. _vllm-benchmark-supported-models-909:

 Supported models
 ================
@@ -72,12 +54,11 @@ Supported models
   {% set docker = data.dockers[0] %}
   {% set model_groups = data.model_groups %}

-   .. _vllm-benchmark-available-models-930:
+   .. _vllm-benchmark-available-models-909:

   The following models are supported for inference performance benchmarking
   with vLLM and ROCm. Some instructions, commands, and recommendations in this
-   documentation might vary by model -- select one to get started. MXFP4 models
-   are only supported on MI355X and MI350X GPUs.
+   documentation might vary by model -- select one to get started.

   .. raw:: html

@@ -86,7 +67,7 @@ Supported models
            <div class="col-2 me-1 px-2 model-param-head">Model</div>
            <div class="row col-10 pe-0">
      {% for model_group in model_groups %}
-               <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
      {% endfor %}
            </div>
         </div>
@@ -108,35 +89,25 @@ Supported models
         </div>
      </div>

-   .. _vllm-benchmark-vllm-930:
+   .. _vllm-benchmark-vllm-909:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}

   .. container:: model-doc {{ model.mad_tag }}

-
-      {% if model.precision == "float4" %}
-      .. important::
-
-         MXFP4 is supported only on MI355X and MI350X GPUs.
-      {% endif %}
-
      .. note::

         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
         Some models require access authorization prior to use via an external license agreement through a third party.
      {% if model.precision == "float8" and model.model_repo.startswith("amd") %}
-         This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
-      {% endif %}
-      {% if model.precision == "float4" and model.model_repo.startswith("amd") %}
-         This model uses FP4 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD GPUs.
+         This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD accelerators.
      {% endif %}

      {% endfor %}
   {% endfor %}

-.. _vllm-benchmark-performance-measurements-930:
+.. _vllm-benchmark-performance-measurements-909:

 Performance measurements
 ========================
@@ -150,7 +121,7 @@ page provides reference throughput and serving measurements for inferencing popu
   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
   only reflects the latest version of this inference benchmarking environment.
-   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct GPUs or ROCm software.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

 System validation
 =================
@@ -167,12 +138,13 @@ To test for optimal performance, consult the recommended :ref:`System health ben
 <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
 system's configuration.

-Pull the Docker image
-=====================
-
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml

   {% set docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}
+
+   Pull the Docker image
+   =====================

   Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
   Use the following command to pull the Docker image from Docker Hub.
@@ -181,18 +153,13 @@ Pull the Docker image

      docker pull {{ docker.pull_tag }}

-Benchmarking
-============
-
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
-
-   {% set docker = data.dockers[0] %}
-   {% set model_groups = data.model_groups %}
+   Benchmarking
+   ============

   Once the setup is complete, choose between two options to reproduce the
   benchmark results:

-   .. _vllm-benchmark-mad-930:
+   .. _vllm-benchmark-mad-909:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
@@ -204,7 +171,7 @@ Benchmarking
         .. tab-item:: MAD-integrated benchmarking

            The following run command is tailored to {{ model.model }}.
-            See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
+            See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.

            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
               directory and install the required packages on the host machine.
@@ -215,9 +182,8 @@ Benchmarking
                  cd MAD
                  pip install -r requirements.txt

-            2. On the host machine, use this command to run the performance benchmark test on
-               the `{{model.model}} <{{ model.url }}>`_ model using one node with the
-               :literal:`{{model.precision}}` data type.
+            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.

               .. code-block:: shell

@@ -225,7 +191,8 @@ Benchmarking
                  madengine run \
                      --tags {{model.mad_tag}} \
                      --keep-model-dir \
-                      --live-output
+                      --live-output \
+                      --timeout 28800

            MAD launches a Docker container with the name
            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
@@ -233,7 +200,7 @@ Benchmarking
            and ``{{ model.mad_tag }}_serving.csv``.

            Although the :ref:`available models
-            <vllm-benchmark-available-models-930>` are preconfigured to collect
+            <vllm-benchmark-available-models-909>` are preconfigured to collect
            offline throughput and online serving performance data, you can
            also change the benchmarking parameters. See the standalone
            benchmarking tab for more information.
@@ -258,12 +225,12 @@ Benchmarking
         .. tab-item:: Standalone benchmarking

            The following commands are optimized for {{ model.model }}.
-            See :ref:`vllm-benchmark-supported-models-930` to switch to another available model.
+            See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.

            .. seealso::

               For more information on configuration, see the `config files
-               <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
+               <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
               in the MAD repository. Refer to the `vLLM engine <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
               for descriptions of available configuration options
               and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
@@ -299,12 +266,13 @@ Benchmarking

               model={{ model.model_repo }}
               tp={{ model.config.tp }}
-               num_prompts={{ model.config.num_prompts | default(1024) }}
-               in={{ model.config.in | default(128) }}
-               out={{ model.config.in | default(128) }}
-               dtype={{ model.config.dtype | default("auto") }}
+               num_prompts=1024
+               in=128
+               out=128
+               dtype={{ model.config.dtype }}
               kv_cache_dtype={{ model.config.kv_cache_dtype }}
-               max_num_seqs={{ model.config.max_num_seqs | default(1024) }}
+               max_num_seqs=1024
+               max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
               max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
               max_model_len={{ model.config.max_model_len }}

@@ -316,11 +284,12 @@ Benchmarking
                   --dtype $dtype \
                   --kv-cache-dtype $kv_cache_dtype \
                   --max-num-seqs $max_num_seqs \
+                   --max-seq-len-to-capture $max_seq_len_to_capture \
                   --max-num-batched-tokens $max_num_batched_tokens \
                   --max-model-len $max_model_len \
                   --trust-remote-code \
                   --output-json ${model}_throughput.json \
-                   --gpu-memory-utilization {{ model.config.gpu_memory_utilization | default(0.9) }}
+                   --gpu-memory-utilization 0.9

            .. rubric:: Serving command

@@ -333,6 +302,7 @@ Benchmarking
                  dtype={{ model.config.dtype }}
                  kv_cache_dtype={{ model.config.kv_cache_dtype }}
                  max_num_seqs=256
+                  max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
                  max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
                  max_model_len={{ model.config.max_model_len }}

@@ -341,6 +311,7 @@ Benchmarking
                      --dtype $dtype \
                      --kv-cache-dtype $kv_cache_dtype \
                      --max-num-seqs $max_num_seqs \
+                      --max-seq-len-to-capture $max_seq_len_to_capture \
                      --max-num-batched-tokens $max_num_batched_tokens \
                      --max-model-len $max_model_len \
                      --no-enable-prefix-caching \
@@ -381,9 +352,6 @@ Benchmarking

            .. note::

-               For improved performance with certain Mixture of Experts models, such as Mixtral 8x22B,
-               try adding ``export VLLM_ROCM_USE_AITER=1`` to your commands.
-
               If you encounter the following error, pass your access-authorized Hugging
               Face token to the gated models.

@@ -422,31 +390,26 @@ see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/d
 Reproducing the Docker image
 ----------------------------

-To reproduce this ROCm-enabled vLLM Docker image release, follow these steps:
+To reproduce this ROCm/vLLM Docker image release, follow these steps:

-1. Clone the `vLLM repository <https://github.com/vllm-project/vllm>`__.
+1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
+
+   .. code-block:: shell
+
+      git clone https://github.com/ROCm/vllm.git
+
+2. Check out the specific release commit.

   .. code-block:: shell

-      git clone https://github.com/vllm-project/vllm.git
      cd vllm
+      git checkout 6663000a391911eba96d7864a26ac42b07f6ef29

-2. Use the following command to build the image directly from the specified commit.
+3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.

-   .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+   .. code-block:: shell

-      {% set docker = data.dockers[0] %}
-      .. code-block:: shell
-
-         docker build -f docker/Dockerfile.rocm \
-             --build-arg REMOTE_VLLM=1 \
-             --build-arg VLLM_REPO=https://github.com/ROCm/vllm \
-             --build-arg VLLM_BRANCH="{{ docker.dockerfile.commit }}" \
-             -t vllm-rocm .
-
-   .. tip::
-
-      Replace ``vllm-rocm`` with your desired image tag.
+      docker build -f docker/Dockerfile.rocm -t vllm-rocm .

 Further reading
 ===============
@@ -457,7 +420,7 @@ Further reading
 - To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

 - To learn more about system settings and management practices to configure your system for
-  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
  a brief introduction to vLLM and optimization strategies.
--- a/docs/how-to/rocm-for-ai/inference/deploy-your-model.rst
+++ b/docs/how-to/rocm-for-ai/inference/deploy-your-model.rst
@@ -44,9 +44,9 @@ Validating vLLM performance
 ---------------------------

 ROCm provides a prebuilt optimized Docker image for validating the performance of LLM inference with vLLM 
-on the MI300X GPU. The Docker image includes ROCm, vLLM, PyTorch, and tuning files in the CSV 
+on the MI300X accelerator. The Docker image includes ROCm, vLLM, PyTorch, and tuning files in the CSV 
 format. For more information, see the guide to 
-`LLM inference performance testing with vLLM on the AMD Instinct™ MI300X GPU <https://github.com/ROCm/MAD/blob/develop/benchmark/vllm/README.md>`_ 
+`LLM inference performance testing with vLLM on the AMD Instinct™ MI300X accelerator <https://github.com/ROCm/MAD/blob/develop/benchmark/vllm/README.md>`_ 
 on the ROCm GitHub repository.

 .. _rocm-for-ai-serve-hugging-face-tgi:
@@ -61,7 +61,7 @@ The `Hugging Face Text Generation Inference <https://huggingface.co/docs/text-ge
 TGI installation
 ----------------

-The easiest way to use Hugging Face TGI with ROCm on AMD Instinct GPUs is to use the official Docker image at
+The easiest way to use Hugging Face TGI with ROCm on AMD Instinct accelerators is to use the official Docker image at
 `<https://github.com/huggingface/text-generation-inference/pkgs/container/text-generation-inference>`__.

 TGI walkthrough
--- a/docs/how-to/rocm-for-ai/inference/hugging-face-models.rst
+++ b/docs/how-to/rocm-for-ai/inference/hugging-face-models.rst
@@ -10,7 +10,7 @@ Running models from Hugging Face
 transformer models. Hugging Face models and tools significantly enhance productivity, performance, and accessibility in
 developing and deploying AI solutions.

-This section describes how to run popular community transformer models from Hugging Face on AMD GPUs.
+This section describes how to run popular community transformer models from Hugging Face on AMD accelerators and GPUs.

 .. _rocm-for-ai-hugging-face-transformers:

@@ -62,11 +62,11 @@ Using Hugging Face with Optimum-AMD

 Optimum-AMD is the interface between Hugging Face libraries and the ROCm software stack.

-For a deeper dive into using Hugging Face libraries on AMD GPUs, refer to the
+For a deeper dive into using Hugging Face libraries on AMD accelerators and GPUs, refer to the
 `Optimum-AMD <https://huggingface.co/docs/optimum/main/en/amd/amdgpu/overview>`_ page on Hugging Face for guidance on
 using Flash Attention 2, GPTQ quantization and the ONNX Runtime integration.

-Hugging Face libraries natively support AMD Instinct GPUs. For other
+Hugging Face libraries natively support AMD Instinct accelerators. For other
 :doc:`ROCm-capable hardware <rocm-install-on-linux:reference/system-requirements>`, support is currently not
 validated, but most features are expected to work without issues.

@@ -139,7 +139,7 @@ To enable `GPTQ <https://arxiv.org/abs/2210.17323>`_, hosted wheels are availabl

      pip install auto-gptq --no-build-isolation --extra-index-url https://huggingface.github.io/autogptq-index/whl/rocm573/

-   Or, to install from source for AMD GPUs supporting ROCm, specify the ``ROCM_VERSION`` environment variable.
+   Or, to install from source for AMD accelerators supporting ROCm, specify the ``ROCM_VERSION`` environment variable.

   .. code-block:: shell

--- a/docs/how-to/rocm-for-ai/inference/index.rst
+++ b/docs/how-to/rocm-for-ai/inference/index.rst
@@ -9,7 +9,7 @@ AI inference is a process of deploying a trained machine learning model to make

 Understanding the ROCm™ software platform’s architecture and capabilities is vital for running AI inference. By leveraging the ROCm platform's capabilities, you can harness the power of high-performance computing and efficient resource management to run inference workloads, leading to faster predictions and classifications on real-time data.

-Throughout the following topics, this section provides a comprehensive guide to setting up and deploying AI inference on AMD GPUs. This includes instructions on how to install ROCm, how to use Hugging Face Transformers to manage pre-trained models for natural language processing (NLP) tasks, how to validate vLLM on AMD Instinct™ MI300X GPUs and illustrate how to deploy trained models in production environments. 
+Throughout the following topics, this section provides a comprehensive guide to setting up and deploying AI inference on AMD GPUs. This includes instructions on how to install ROCm, how to use Hugging Face Transformers to manage pre-trained models for natural language processing (NLP) tasks, how to validate vLLM on AMD Instinct™ MI300X accelerators, and how to deploy trained models in production environments.

 The AI Developer Hub contains `AMD ROCm tutorials <https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/>`_ for
 training, fine-tuning, and inference. It leverages popular machine learning frameworks on AMD GPUs.
--- a/docs/how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
+++ b/docs/how-to/rocm-for-ai/inference/llm-inference-frameworks.rst
@@ -60,7 +60,7 @@ Installing vLLM
               vllm-rocm \
               bash

-      3. Inside the container, start the API server to run on a single GPU on port 8000 using the following command.
+      3. Inside the container, start the API server to run on a single accelerator on port 8000 using the following command.

         .. code-block:: shell

@@ -113,7 +113,7 @@ Installing vLLM
            python -m vllm.entrypoints.api_server --model /app/model --dtype float16 -tp 2 --port 8000 &

      4. To run multiple instances of API Servers, specify different ports for each server, and use ``ROCR_VISIBLE_DEVICES`` to
-         isolate each instance to a different GPU.
+         isolate each instance to a different accelerator.

         For example, to run two API servers, one on port 8000 using GPU 0 and 1, one on port 8001 using GPU 2 and 3, use a
         command like the following.
@@ -140,7 +140,7 @@ Installing vLLM
   See :ref:`mi300x-vllm-optimization` for performance optimization tips.

   ROCm provides a prebuilt optimized Docker image for validating the performance of LLM inference with vLLM
-   on the MI300X GPU. The Docker image includes ROCm, vLLM, and PyTorch.
+   on the MI300X accelerator. The Docker image includes ROCm, vLLM, and PyTorch.
   For more information, see :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.

 .. _fine-tuning-llms-tgi:
@@ -178,7 +178,7 @@ Install TGI
   .. tab-item:: TGI on a single-accelerator system
      :sync: single

-      2. Inside the container, launch a model using TGI server on a single GPU.
+      2. Inside the container, launch a model using TGI server on a single accelerator.

         .. code-block:: shell

@@ -199,7 +199,7 @@ Install TGI

   .. tab-item:: TGI on a multi-accelerator system

-      2. Inside the container, launch a model using TGI server on multiple GPUs (four in this case).
+      2. Inside the container, launch a model using TGI server on multiple accelerators (4 in this case).

         .. code-block:: shell

--- a/docs/how-to/rocm-for-ai/install.rst
+++ b/docs/how-to/rocm-for-ai/install.rst
@@ -47,7 +47,7 @@ Deep learning frameworks
 ========================

 ROCm supports deep learning frameworks and libraries including `PyTorch
-<https://pytorch.org>`_, `TensorFlow
+<https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package>`_, `TensorFlow
 <https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and more.

 Review the :doc:`framework installation documentation <../deep-learning-rocm>`. For ease-of-use, it's recommended to use official ROCm prebuilt Docker
@@ -57,4 +57,4 @@ Next steps
 ==========

 After installing ROCm and your desired ML libraries -- and before running AI workloads -- conduct system health benchmarks
-to test the optimal performance of your AMD hardware. See :doc:`system-setup/index` to get started.
+to test the optimal performance of your AMD hardware. See :doc:`system-health-check` to get started.
--- a/docs/how-to/rocm-for-ai/system-setup/system-health-check.rst
+++ b/docs/how-to/rocm-for-ai/system-setup/system-health-check.rst
@@ -1,14 +1,12 @@
-:orphan:
-
 .. meta::
   :description: System health checks with RVS, RCCL tests, BabelStream, and TransferBench to validate AMD hardware performance running AI workloads.
   :keywords: gpu, accelerator, system, health, validation, bench, perf, performance, rvs, rccl, babel, mi300x, mi325x, flops, bandwidth, rbt, training, inference

 .. _rocm-for-ai-system-health-bench:

-*****************************************
-System health benchmarks for AI workloads
-*****************************************
+************************
+System health benchmarks
+************************

 Before running AI workloads, it is important to validate that your AMD hardware is configured correctly and is performing optimally. This topic outlines several system health benchmarks you can use to test key aspects like GPU compute capabilities (FLOPS), memory bandwidth, and interconnect performance. Many of these tests are part of the ROCm Validation Suite (RVS).

@@ -33,7 +31,7 @@ installed, run the following command:
   sudo apt install rocm-validation-suite

 See the `ROCm Validation Suite installation instructions <https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/install/installation.html>`_,
-and `System validation tests <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html>`_
+and `System validation tests <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#system-validation-tests>`_
 in the Instinct documentation for more detailed instructions.

 Benchmark, stress, and qualification tests
@@ -43,7 +41,7 @@ The GPU stress test runs various GEMM computations as workloads to stress the GP
 meets the configured target GFLOPS.

 Run the benchmark, stress, and qualification tests included with RVS. See the `Benchmark, stress, qualification
-<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html#benchmark-stress-qualification>`_
+<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/system-validation.html#benchmark-stress-qualification>`_
 section of the Instinct documentation for usage instructions.

 BabelStream test
@@ -55,7 +53,7 @@ BabelStream tests are included with the RVS package as part of the `BABEL module
 <https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/latest/conceptual/rvs-modules.html#babel-benchmark-test-babel-module>`_.

 For more information, see `Performance benchmarking
-<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html#babelstream>`_
+<https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#babelstream-benchmarking-results>`_
 in the Instinct documentation.

 RCCL tests
@@ -64,7 +62,7 @@ RCCL tests
 The ROCm Communication Collectives Library (RCCL) enables efficient multi-GPU
 communication. The `<https://github.com/ROCm/rccl-tests>`__ suite benchmarks
 the performance and verifies the correctness of these collective operations.
-This helps ensure optimal scaling for multi-GPU tasks.
+This helps ensure optimal scaling for multi-accelerator tasks.

 1. To get started, build RCCL-tests using the official instructions in the README at
   `<https://github.com/ROCm/rccl-tests?tab=readme-ov-file#build>`__ or use the
@@ -77,8 +75,8 @@ This helps ensure optimal scaling for multi-GPU tasks.
      make

 2. Run the suggested RCCL tests -- see `RCCL benchmarking
-   <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/network/rdma-benchmarking.html#rccl-benchmarking-results>`_
-   in the AMD Instinct customer acceptance guide.
+   <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#rccl-benchmarking-results>`_
+   in the Instinct performance benchmarking documentation for instructions.

 TransferBench test
 ==================
--- a/docs/how-to/rocm-for-ai/system-setup/index.rst
+++ b/docs/how-to/rocm-for-ai/system-setup/index.rst
@@ -1,40 +0,0 @@
-.. meta::
-   :description: System setup and validation steps for AI training and inference on ROCm
-   :keywords: AMD Instinct, ROCm, GPU, AI, training, inference, benchmarking, performance, validation
-
-*************************************
-System setup for AI workloads on ROCm
-*************************************
-
-Before you begin training or inference on AMD Instinct™ GPUs, complete
-the following system setup and validation steps to ensure optimal performance.
-
-Prerequisite system validation
-==============================
-
-First, confirm that your system meets all software and hardware prerequisites.
-See :doc:`prerequisite-system-validation`.
-
-Docker images for AMD Instinct GPUs
-===================================
-
-AMD provides prebuilt Docker images for AMD Instinct™ MI300X and MI325X
-GPUs. These images include ROCm-enabled deep learning frameworks and
-essential software components. They support single-node and multi-node configurations
-and are ready for training and inference workloads out of the box.
-
-Multi-node training
-------------------
-
-For instructions on enabling multi-node training, see :doc:`multi-node-setup`.
-
-System optimization and validation
-==================================
-
-Before running workloads, verify that the system is configured correctly and
-operating at peak efficiency. Recommended steps include:
-
- Disabling NUMA auto-balancing
- Running system benchmarks to validate hardware performance
-
-For details on running system health checks, see :doc:`system-health-check`.
--- a/docs/how-to/rocm-for-ai/system-setup/multi-node-setup.rst
+++ b/docs/how-to/rocm-for-ai/system-setup/multi-node-setup.rst
@@ -1,320 +0,0 @@
-.. meta::
-   :description: Multi-node setup for AI training
-   :keywords: gpu, system, health, validation, bench, perf, performance, rvs, rccl, babel, mi300x, mi325x, flops, bandwidth, rbt, training
-
-.. _rocm-for-ai-multi-node-setup:
-
-*********************************
-Multi-node setup for AI workloads
-*********************************
-
-AMD provides ready-to-use Docker images for AMD Instinct™ MI300X and MI325X
-GPUs containing ROCm-capable deep learning frameworks and essential
-software components. These Docker images can run and leverage multiple nodes if
-they are available. This page describes how to enable the multi-node training
-of AI workloads on AMD Instinct GPUs.
-
-Prerequisites
-=============
-
-Before starting, ensure your environment meets the following requirements:
-
-* Multi-node networking: your cluster should have a configured multi-node network. For setup
-  instructions, see the `Multi-node network configuration for AMD Instinct
-  GPUs
-  <https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/multi-node-config.html>`__
-  guide in the Instinct documentation.
-
-* ROCm Docker container to simplify environment setup for AI workloads. See the following resources to get started:
-
-  * :doc:`Training a model with Megatron-LM and ROCm <../training/benchmark-docker/megatron-lm>`
-
-  * :doc:`Training a model with PyTorch and ROCm <../training/benchmark-docker/pytorch-training>`
-
-  * :doc:`Training a model with JAX MaxText and ROCm <../training/benchmark-docker/jax-maxtext>`
-
-* Slurm workload manager to run the :ref:`provided examples <multi-node-setup-training-examples>`.
-
-Install required packages
-=========================
-
-To run multi-node workloads, ensure you have all the required packages installed based on your
-network device. For example, on Ubuntu systems:
-
-.. code-block:: shell
-
-   apt install -y iproute2
-
-   apt install -y linux-headers-"$(uname -r)" libelf-dev
-
-   apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev
-
-Compile and install the RoCE library
------------------------------------
-
-If you're using Broadcom NICs, you need to compile and install the RoCE (RDMA
-over Converged Ethernet) library. See `RoCE cluster network configuration guide
-for AMD Instinct GPUs
-<https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/roce-network-config.html>`__
-for more information.
-
-See the `Ethernet networking guide for AMD
-Instinct MI300X GPU clusters: Compiling Broadcom NIC software from source
-<https://docs.broadcom.com/doc/957608-AN2XX#page=81>`_ for more details.
-
-.. important::
-
-   It is crucial to install the exact same version of the RoCE library that
-   is installed on your host system. Also, ensure that the path to these
-   libraries on the host is correctly mounted into your Docker container.
-   Failure to do so can lead to compatibility issues and communication
-   failures.
-
-1. Set ``BUILD_DIR`` to the path on the host system where the Broadcom drivers and ``bnxt_rocelib`` source are located.
-   Then, navigate to the ``bnxt_rocelib`` directory.
-
-   .. code-block:: shell
-
-      export BUILD_DIR=/path/to/your/broadcom_drivers_on_host
-      cd $BUILD_DIR/drivers_linux/bnxt_rocelib/
-
-2. The ``bnxt_rocelib`` directory contains a version of ``libbnxt_re`` in a zipped ``.tar.gz`` file.
-
-   .. code-block:: shell
-
-      tar -xf libbnxt_re-a.b.c.d.tar.gz
-      cd libbnxt_re-a.b.c.d
-
-3. Compile and install the RoCE library.
-
-   .. code-block:: shell
-
-      sh autogen.sh
-      ./configure
-      make
-      find /usr/lib64/ /usr/lib -name "libbnxt_re-rdmav*.so" -exec mv {} {}.inbox \;
-      make install all
-      sh -c "echo /usr/local/lib >> /etc/ld.so.conf"
-      ldconfig
-      cp -f bnxt_re.driver /etc/libibverbs.d/
-      find . -name "*.so" -exec md5sum {} \;
-      BUILT_MD5SUM=$(find . -name "libbnxt_re-rdmav*.so" -exec md5sum {} \; | cut -d " " -f 1)
-
-Environment setup
-=================
-
-Before running multi-node workloads, set these essential environment variables:
-
-Master address
--------------
-
-By default, ``localhost`` is used for single-node configurations. Change
-``localhost`` to the master node's resolvable hostname or IP address:
-
-.. code-block:: bash
-
-   export MASTER_ADDR="${MASTER_ADDR:-localhost}"
-
-Number of nodes
---------------
-
-Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):
-
-.. code-block:: bash
-
-   export NNODES="${NNODES:-<num_nodes>}"
-
-Node ranks
----------
-
-Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
-Node ranks should be unique across all nodes in the cluster.
-
-.. code-block:: bash
-
-   export NODE_RANK="${NODE_RANK:-<node_rank>}"
-
-Network interface
-----------------
-
-Update the network interface in the script to match your system's network interface. To
-find your network interface, run the following (outside of any Docker container):
-
-.. code-block:: bash
-
-   ip a
-
-Look for an active interface (status "UP") with an IP address in the same subnet as
-your other nodes. Then, update the following variable in the script, for
-example:
-
-.. code-block:: bash
-
-   export NCCL_SOCKET_IFNAME=ens50f0np0
-
-This variable specifies which network interface to use for inter-node communication.
-Setting this variable to the incorrect interface can result in communication failures
-or significantly reduced performance.
-
-.. tip::
-
-  This command sets ``NCCL_SOCKET_IFNAME``'s value to the last RDMA interface.
-
-  .. code-block:: bash
-
-     export NCCL_SOCKET_IFNAME=$(rdma link show | awk '{print $NF}' | sort | tail -n1)
-
-RDMA/IB interface
-----------------
-
-Set the RDMA interfaces to be used for communication. NICs can come from different vendors and the names of the RDMA interface can be different. To get the list of all the RDMA/IB devices, run:
-
-.. code-block:: bash
-
-   ibv_devices
-
-The command below gets the list of all RDMA/IB devices and puts them in a
-comma-separated format. If
-(``rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7``) are your RDMA
-interfaces, then set:
-
-.. code-block:: bash
-
-   # If using Broadcom NIC
-   export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
-   # If using Mellanox NIC
-   # export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
-
-.. tip::
-
-  Alternatively, if you want to choose the RDMA interface automatically, you
-  can use the following. This command will sort the RDMA interfaces and then
-  select the first eight RDMA interfaces.
-
-  .. code-block:: bash
-
-     export NCCL_IB_HCA=$(ibv_devices | awk 'NR>2 {print $1}' | sort | head -n 8 | paste -sd,)
-
-Global ID index
---------------
-
-Update the global ID index if you're using RoCE.
-
-.. code-block:: bash
-
-   export NCCL_IB_GID_INDEX=3
-
-.. _multi-node-setup-training-examples:
-
-Multi-node training examples
-============================
-
-The following examples use the Slurm workload manager to launch jobs on
-multiple nodes. To run these scripts as-is, you must have a Slurm environment
-configured. The scripts are designed to work with both Broadcom Thor 2 and
-Mellanox NICs by automatically installing the required libraries and setting
-the necessary environment variables. For systems with Broadcom NICs, the
-scripts assume the host's RoCE library is located in the ``/opt`` directory.
-
-The following benchmarking examples demonstrate the training of a Llama 3 8B model
-across multiple 8-GPU nodes, using FSDP for intra-node parallelism and DP for
-inter-node parallelism.
-
-.. _rocm-for-ai-multi-node-setup-jax-train-example:
-
-JAX MaxText
-----------
-
-1. Download the desired multi-node benchmarking script from `<https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
-
-   .. code-block:: shell
-
-      wget https://raw.githubusercontent.com/ROCm/MAD/refs/heads/develop/scripts/jax-maxtext/gpu-rocm/llama3_8b_multinode.sh
-
-   Or clone the `<https://github.com/ROCm/MAD>`__ repository.
-
-   .. code-block:: shell
-
-      git clone https://github.com/ROCm/MAD
-      cd MAD/scripts/jax-maxtext/gpu-rocm
-
-2. Run the benchmark for multi-node training.
-
-   .. code-block:: shell
-
-      sbatch -N <num_nodes> llama3_8b_multinode.sh
-
-.. _rocm-for-ai-multi-node-setup-pyt-train-example:
-
-PyTorch training
----------------
-
-.. note::
-
-   The ROCm PyTorch Training Docker image now focuses on :doc:`Training a model
-   with Primus and PyTorch <../training/benchmark-docker/primus-pytorch>`. The
-   following example refers to the legacy workflow :ref:`Training a
-   model with PyTorch <amd-pytorch-training-multinode-examples>`.
-
-1. Download the ``run_multinode_train.sh`` benchmarking script from `<https://github.com/ROCm/MAD/tree/develop/scripts/pytorch_train>`__.
-
-   .. code-block:: shell
-
-      wget https://raw.githubusercontent.com/ROCm/MAD/refs/heads/develop/scripts/pytorch_train/run_multinode_train.sh
-
-   Or clone the `<https://github.com/ROCm/MAD>`__ repository.
-
-   .. code-block:: shell
-
-      git clone https://github.com/ROCm/MAD
-      cd MAD/scripts/pytorch_train
-
-2. Run the benchmark for multi-node training.
-
-   .. code-block:: shell
-
-      sbatch -N <num_nodes> run_multinode_train.sh
-
-.. seealso::
-
-   See :ref:`Training a model with PyTorch <amd-pytorch-multinode-examples>` for more examples and information.
-
-Megatron-LM
-----------
-
-.. note::
-
-   The Megatron-LM Docker image now focuses on :ref:`Training a model with
-   Primus and Megatron <amd-primus-megatron-multi-node-examples>`. The
-   following example refers to the legacy Megatron-LM :ref:`Training a model
-   with Megatron-LM <amd-megatron-lm-multi-node-examples>` and might have
-   limited support.
-
-1. Download the ``train_llama_slurm.sh`` benchmarking script from
-   `<https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama_slurm.sh>`__.
-
-2. Set the network interface parameters as per the above guidelines and run the script.
-
-   .. code-block:: shell
-
-      cd </path/to/your/Megatron-LM>
-      export NETWORK_INTERFACE=$NCCL_SOCKET_IFNAME
-      export NCCL_IB_HCA=$NCCL_IB_HCA
-      export IMAGE=docker.io/rocm/megatron-lm:latest  # or your preferred image
-      export DATA_CACHE_PATH=/nfs/mounted/repo
-
-      sbatch -N <num_nodes> examples/llama/train_llama_slurm.sh <MODEL_SIZE> <MBS> <GBS> <SEQ_LENGTH> <FSDP> <RECOMPUTE>
-
-3. For example, to run a Llama 3 8B workload in BF16 precision, use the following command.
-
-   .. code-block:: shell
-
-      MODEL_NAME=llama3 sbatch -N 8 examples/llama/train_llama_slurm.sh 8 2 128 8192 0 0
-      # Other parameters, such as TP, FP8 datatype, can be adjusted in the script.
-
-Further reading
-===============
-
-* `Multi-node network configuration for AMD Instinct GPUs <https://instinct.docs.amd.com/projects/gpu-cluster-networking/en/latest/how-to/multi-node-config.html>`__
-
-* `Ethernet networking guide for AMD Instinct MI300X GPU clusters: Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#page=81>`__
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -6,8 +6,14 @@
 Training a model with JAX MaxText on ROCm
 ******************************************

+MaxText is a high-performance, open-source framework built on the Google JAX
+machine learning library to train LLMs at scale. The MaxText framework for
+ROCm is an optimized fork of the upstream
+`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
+on AMD MI300X series accelerators.
+
 The MaxText for ROCm training Docker image
-provides a prebuilt environment for training on AMD Instinct MI355X, MI350X, MI325X, and MI300X GPUs,
+provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
 including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
 It includes the following software components:

@@ -19,7 +25,7 @@ It includes the following software components:
      {% for docker in dockers %}
      {% set jax_version = docker.components["JAX"] %}

-      .. tab-item:: ``{{ docker.pull_tag }}``
+      .. tab-item:: JAX {{ jax_version }}
         :sync: {{ docker.pull_tag }}

         .. list-table::
@@ -41,6 +47,10 @@ It includes the following software components:
            ``shardy=False`` during the training run. You can also follow the `migration
            guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
            it.
+
+            The provided multi-node training scripts in this documentation are
+            not currently supported with JAX 0.6.0. For multi-node training, use the JAX 0.5.0
+            Docker image.
         {% endif %}

      {% endfor %}
@@ -55,15 +65,15 @@ MaxText with on ROCm provides the following key features to train large language

 - Multi-node support

- NANOO FP8 (for MI300X series GPUs) and FP8 (for MI355X and MI350X) quantization support
+- NANOO FP8 quantization support

-.. _amd-maxtext-model-support-v259:
+.. _amd-maxtext-model-support-v257:

 Supported models
 ================

-The following models are pre-optimized for performance on AMD Instinct
-GPUs. Some instructions, commands, and available training
+The following models are pre-optimized for performance on AMD Instinct MI300
+series accelerators. Some instructions, commands, and available training
 configurations in this documentation might vary by model -- select one to get
 started.

@@ -126,28 +136,89 @@ This Docker image is optimized for specific model configurations outlined
 as follows. Performance can vary for other training workloads, as AMD
 doesn’t validate configurations and run conditions outside those described.

-Pull the Docker image
---------------------
+.. _amd-maxtext-multi-node-setup-v257:

-Use the following command to pull the Docker image from Docker Hub.
+Multi-node setup
+----------------

-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+For multi-node environments, ensure you have all the necessary packages for
+your network device, such as RDMA. If you're not using a multi-node setup
+with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`.

-   {% set docker = data.dockers[0] %}
+1. Install the following packages to build and install the RDMA driver.

   .. code-block:: shell

-      docker pull {{ docker.pull_tag }}
+      sudo apt install iproute2 -y
+      sudo apt install -y linux-headers-"$(uname -r)" libelf-dev
+      sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev

-.. _amd-maxtext-multi-node-setup-v259:
+   Refer to your NIC manufacturer's documentation for further steps on
+   compiling and installing the RoCE driver. For example, for Broadcom,
+   see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
+   in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.

-Multi-node configuration
------------------------
+2. Set the following environment variables.

-See :doc:`/how-to/rocm-for-ai/system-setup/multi-node-setup` to configure your
-environment for multi-node training.
+   a. Master address

-.. _amd-maxtext-get-started-v259:
+      Change ``localhost`` to the master node's resolvable hostname or IP address:
+
+      .. code-block:: bash
+
+         export MASTER_ADDR="${MASTER_ADDR:-localhost}"
+
+   b. Number of nodes
+
+      Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):
+
+      .. code-block:: bash
+
+         export NNODES="${NNODES:-1}"
+
+   c. Node ranks
+
+      Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
+      Node ranks should be unique across all nodes in the cluster.
+
+      .. code-block:: bash
+
+         export NODE_RANK="${NODE_RANK:-0}"
+
+   d. Network interface
+
+      Update the network interface in the script to match your system's network interface. To
+      find your network interface, run the following (outside of any Docker container):
+
+      .. code-block:: bash
+
+         ip a
+
+      Look for an active interface with an IP address in the same subnet as
+      your other nodes. Then, update the following variable in the script, for
+      example:
+
+      .. code-block:: bash
+
+         export NCCL_SOCKET_IFNAME=ens50f0np0
+
+      This variable specifies which network interface to use for inter-node communication.
+      Setting this variable to the incorrect interface can result in communication failures
+      or significantly reduced performance.
+
+   e. RDMA interface
+
+      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v257>` are installed on all nodes.
+      Then, set the RDMA interfaces to use for communication.
+
+      .. code-block:: bash
+
+         # If using Broadcom NIC
+         export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+         # If using Mellanox NIC
+         # export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
+
+.. _amd-maxtext-get-started-v257:

 Benchmarking
 ============
@@ -159,7 +230,7 @@ benchmark results:

   .. _vllm-benchmark-mad:

-   {% set docker = data.dockers[0] %}
+   {% set dockers = data.dockers %}
   {% set model_groups = data.model_groups %}
   {% for model_group in model_groups %}
      {% for model in model_group.models %}
@@ -171,9 +242,6 @@ benchmark results:
         {% if model.mad_tag and "single-node" in model.doc_options %}
         .. tab-item:: MAD-integrated benchmarking

-            The following run command is tailored to {{ model.model }}.
-            See :ref:`amd-maxtext-model-support-v259` to switch to another available model.
-
            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
               directory and install the required packages on the host machine.

@@ -202,19 +270,22 @@ benchmark results:

         .. tab-item:: Standalone benchmarking

-            The following commands are optimized for {{ model.model }}. See
-            :ref:`amd-maxtext-model-support-v259` to switch to another
-            available model. Some instructions and resources might not be
-            available for all models and configurations.
-
            .. rubric:: Download the Docker image and required scripts

            Run the JAX MaxText benchmark tool independently by starting the
            Docker container as shown in the following snippet.

-            .. code-block:: shell
+            .. tab-set::
+               {% for docker in dockers %}
+               {% set jax_version = docker.components["JAX"] %}

-               docker pull {{ docker.pull_tag }}
+               .. tab-item:: JAX {{ jax_version }}
+                  :sync: {{ docker.pull_tag }}
+
+                  .. code-block:: shell
+
+                     docker pull {{ docker.pull_tag }}
+               {% endfor %}

            {% if model.model_repo and "single-node" in model.doc_options %}
            .. rubric:: Single node training
@@ -235,25 +306,33 @@ benchmark results:

            2. Launch the Docker container.

-               .. code-block:: shell
+               .. tab-set::
+                  {% for docker in dockers %}
+                  {% set jax_version = docker.components["JAX"] %}

-                  docker run -it \
-                      --device=/dev/dri \
-                      --device=/dev/kfd \
-                      --network host \
-                      --ipc host \
-                      --group-add video \
-                      --cap-add=SYS_PTRACE \
-                      --security-opt seccomp=unconfined \
-                      --privileged \
-                      -v $HOME:$HOME \
-                      -v $HOME/.ssh:/root/.ssh \
-                      -v $HF_HOME:/hf_cache \
-                      -e HF_HOME=/hf_cache \
-                      -e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN \
-                      --shm-size 64G \
-                      --name training_env \
-                      {{ docker.pull_tag }}
+                  .. tab-item:: JAX {{ jax_version }}
+                     :sync: {{ docker.pull_tag }}
+
+                     .. code-block:: shell
+
+                        docker run -it \
+                            --device=/dev/dri \
+                            --device=/dev/kfd \
+                            --network host \
+                            --ipc host \
+                            --group-add video \
+                            --cap-add=SYS_PTRACE \
+                            --security-opt seccomp=unconfined \
+                            --privileged \
+                            -v $HOME:$HOME \
+                            -v $HOME/.ssh:/root/.ssh \
+                            -v $HF_HOME:/hf_cache \
+                            -e HF_HOME=/hf_cache \
+                            -e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN \
+                            --shm-size 64G \
+                            --name training_env \
+                            {{ docker.pull_tag }}
+                  {% endfor %}

            3. In the Docker container, clone the ROCm MAD repository and navigate to the
               benchmark scripts directory at ``MAD/scripts/jax-maxtext``.
@@ -276,27 +355,17 @@ benchmark results:

                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }}

-               For quantized training, run the script with the appropriate option for your Instinct GPU.
+               For quantized training, use the following command:

-               .. tab-set::
+               .. code-block:: shell

-                  .. tab-item:: MI355X and MI350X
+                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8

-                     For ``fp8`` quantized training on MI355X and MI350X GPUs, use the following command:
+               .. important::

-                     .. code-block:: shell
-
-                        ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q fp8
-
-                  {% if model.model_repo not in ["Llama-3.1-70B", "Llama-3.3-70B"] %}
-                  .. tab-item:: MI325X and MI300X
-
-                     For ``nanoo_fp8`` quantized training on MI300X series GPUs, use the following command:
-
-                     .. code-block:: shell
-
-                        ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8
-                  {% endif %}
+                  Quantized training is not supported with the JAX 0.6.0 Docker image; support
+                  will be added in a future release. For quantized training, use the JAX 0.5.0
+                  Docker image: ``rocm/jax-training:maxtext-v25.7``.

            {% endif %}
            {% if model.multinode_training_script and "multi-node" in model.doc_options %}
@@ -310,11 +379,11 @@ benchmark results:
               benchmark. Run them outside of any Docker container.

            1. Make sure ``$HF_HOME`` is set before running the test. See
-               `ROCm benchmarking <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/readme.md>`__
+               `ROCm benchmarking <https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__
               for more details on downloading the Llama models before running the
               benchmark.

-            2. To run multi-node training for {{ model.model }},
+            2. To run multi-node training for {{ model.model }}, 
               use the
               `multi-node training script <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/{{ model.multinode_training_script }}>`__
               under the ``scripts/jax-maxtext/gpu-rocm/`` directory.
@@ -328,7 +397,7 @@ benchmark results:
         {% else %}
            .. rubric:: Multi-node training

-            For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v259`
+            For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v257`
            with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
         {% endif %}
      {% endfor %}
@@ -337,10 +406,12 @@ benchmark results:
 Further reading
 ===============

+- See the ROCm/maxtext benchmarking README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__.
+
 - To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

 - To learn more about system settings and management practices to configure your system for
-  AMD Instinct MI300X Series GPUs, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

 - For a list of other ready-made Docker images for AI with ROCm, see
  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
randyh62	2e89f254ba	Update .wordlist.txt fix spelling	2025-09-16 12:57:00 -07:00
randyh62	ffc9b918fa	Update RELEASE.md	2025-09-16 12:52:42 -07:00
randyh62	28742033e6	Revert "Update RELEASE.md (#5330 )" This reverts commit `9f703e27bb`.	2025-09-16 12:51:10 -07:00
Peter Park	24dec07aef	Add NCF to pytorch training benchmark doc (#5352 ) (#5353 ) * add previous version (25.6) * fix template * Formatting and wording fixes * add caveats * update yaml * add note to pytorch-training * fix template * make model name shorter (cherry picked from commit `bab853a0d3`)	2025-09-16 13:33:07 -04:00
Pratik Basyal	9e1871a01b	Github Issue Links updated (#5350 ) (#5351 ) * 7.0.0 compatibility updated * GIM link updated	2025-09-16 13:04:24 -04:00
Peter Park	b0fdab6c8c	fix pldm note (#5346 ) (#5348 ) (cherry picked from commit `8c40d14d7e`)	2025-09-16 11:14:43 -05:00
Peter Park	4e45bf7838	Merge develop into docs/7.0.0 (#5340 ) * Post GA fixes develop (#5329) * Develop link updated * Release notes and compatibilty update * Compatibilitbity updated * RPP link updated * Compatibility updated for 7.0.0 (#5332) * Compatibility udpated * Minor fix * docs(PyTorch training v25.8): Add Primus and update PyTorch training benchmark docs (#5331) * pyt: update previous versions list update conf.py * pyt: update yaml and rst update update toc * update headings and anchors * pyt: update doc * update docker hub urls * docs: Add SGLang disaggregated P/D inference w/ Mooncake guide (#5335) * add main content * Update content and format add clarification update update data * fix fix fix * fix: deepseek v3 * add ki * Update docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst Co-authored-by: Leo Paoletti 
<164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> * Update docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang-distributed.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> --------- Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> --------- Co-authored-by: Pratik Basyal <prbasyal@amd.com> Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>	2025-09-16 10:42:08 -05:00
Pratik Basyal	ef75f43c5e	700 compatibility matrix fix (#5333 ) * Post GA fixes develop (#5329) * Develop link updated * Release notes and compatibilty update * Compatibilitbity updated * RPP link updated * Compatibility updated for 7.0.0 (#5332) * Compatibility udpated * Minor fix	2025-09-16 10:18:35 -05:00
randyh62	9f703e27bb	Update RELEASE.md (#5330 ) update llvm-project link URL	2025-09-16 07:32:03 -07:00
anisha-amd	1214bd84ed	Docs: deep learning table fix	2025-09-16 09:26:45 -04:00
Parag Bhandari	f8cb05fd07	Merge branch 'roc-7.0.x' into docs/7.0.0	2025-09-16 08:20:03 -04:00
Pratik Basyal	22a9ab4626	700 reset link [Develop] (#5325 ) (#5327 ) * TOC link update and manifest removed * Link reset * Changelog synced	2025-09-16 08:10:42 -04:00
Parag Bhandari	63d8f852da	Merge branch 'roc-7.0.x' into docs/7.0.0	2025-09-16 07:37:31 -04:00
Pratik Basyal	72127d21d3	700 update pre GA batch1 (#5322 ) (#5324 ) * Fix PLDM note for ROCm 7.0 (#5320) * fix pdlm for mi300x * update debian 12 support note * 7.0.0 Release notes update Batch 9 (#559) * Changelog synced * Compatibilty updated * Compatibilty update * Compiler highlight updated * wordlist updated --------- Co-authored-by: Peter Park <peter.park@amd.com>	2025-09-16 06:31:27 -05:00
Parag Bhandari	ecbcc9b11f	Merge branch 'develop' into docs/7.0.0	2025-09-16 06:09:57 -04:00
pbhandar-amd	76571df432	Sync develop into docs/7.0.0	2025-09-15 21:44:26 -04:00
pbhandar-amd	40ffdeb995	Sync develop into docs/7.0.0	2025-09-15 12:14:07 -04:00
pbhandar-amd	681f31fbb2	Sync develop into docs/7.0.0	2025-09-11 17:55:27 -04:00
pbhandar-amd	ceae5bc124	Update documentation requirements for ROCm	2025-09-11 15:27:33 -04:00
anisha-amd	5f516799fe	Docs: adding ray and llama.cpp live blog links (#5290 ) (#5292 )	2025-09-10 15:15:41 -04:00
anisha-amd	d6e4bb6ff6	Docs: frameworks compatibility- ray and llama.cpp (#5273 ) (#5275 )	2025-09-09 12:36:25 -04:00
pbhandar-amd	25ec3eec87	Sync develop into docs/7.0.0	2025-08-28 17:44:53 -04:00
pbhandar-amd	6048413d0d	Update documentation requirements	2025-08-28 17:09:16 -04:00
pbhandar-amd	94a4e655a7	Update requirements.in	2025-08-28 16:48:02 -04:00