Merge pull request #612 from ROCm/sync-develop-from-external

Sync develop from external for 7.1.0 GA
2026-01-08 22:28:06 -05:00 · 2025-10-29 17:13:01 -04:00
parent 54d226acd9 36c879b7e0
commit fe3dc988b8
29 changed files with 1832 additions and 810 deletions
--- a/.azuredevops/components/hipTensor.yml
+++ b/.azuredevops/components/hipTensor.yml
@@ -130,7 +130,7 @@ jobs:
      parameters:
        componentName: hipTensor
        testDir: '$(Agent.BuildDirectory)/rocm/bin/hiptensor'
-        testParameters: '-E ".*-extended" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
+        testParameters: '-E ".*-extended" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/rccl.yml
+++ b/.azuredevops/components/rccl.yml
@@ -1,10 +1,35 @@
 parameters:
+- name: componentName
+  type: string
+  default: rccl
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+- name: systemsRepo
+  type: string
+  default: systems_repo
+- name: systemsSparseCheckoutDir
+  type: string
+  default: 'projects/rocprofiler-sdk'
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -57,19 +82,28 @@ parameters:
  type: object
  default:
    buildJobs:
-      - gfx942:
-        target: gfx942
-      - gfx90a:
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    testJobs:
-      - gfx942:
-        target: gfx942
-      - gfx90a:
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+- name: downstreamComponentMatrix
+  type: object
+  default:
+    - rocprofiler-sdk:
+      name: rocprofiler-sdk
+      sparseCheckoutDir: ''
+      skipUnifiedBuild: 'false'
+      buildDependsOn:
+        - rccl_build

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rccl_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}_${{ job.target }}
    timeoutInMinutes: 120
    variables:
    - group: common
@@ -77,17 +111,23 @@ jobs:
    - name: HIP_ROCCLR_HOME
      value: $(Build.BinariesDirectory)/rocm
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-custom.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        submoduleBehaviour: recursive
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
@@ -97,10 +137,14 @@ jobs:
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
+        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        os: ${{ job.os }}
        extraBuildFlags: >-
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
@@ -112,58 +156,87 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        gpuTarget: ${{ job.target }}
-        extraEnvVars:
-          - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
-        installLatestCMake: true
+    - ${{ if eq(job.os, 'ubuntu2204') }}:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          gpuTarget: ${{ job.target }}
+          extraEnvVars:
+            - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
+          installLatestCMake: true

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rccl_test_${{ job.target }}
-    timeoutInMinutes: 120
-    dependsOn: rccl_build_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rccl
-        testDir: '$(Agent.BuildDirectory)/rocm/bin'
-        testExecutable: './rccl-UnitTests'
-        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
+      timeoutInMinutes: 120
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
+          os: ${{ job.os }}
+          gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          os: ${{ job.os }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          os: ${{ job.os }}
+          testDir: '$(Agent.BuildDirectory)/rocm/bin'
+          testExecutable: './rccl-UnitTests'
+          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+
+- ${{ if parameters.triggerDownstreamJobs }}:
+  - ${{ each component in parameters.downstreamComponentMatrix }}:
+    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
+      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+        parameters:
+          checkoutRepo: ${{ parameters.systemsRepo }}
+          sparseCheckoutDir: ${{ parameters.systemsSparseCheckoutDir }}
+          triggerDownstreamJobs: true
+          unifiedBuild: ${{ parameters.unifiedBuild }}
+          ${{ if parameters.unifiedBuild }}:
+            buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
+          ${{ else }}:
+            buildDependsOn: ${{ component.buildDependsOn }}
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
--- a/.azuredevops/components/rocm-cmake.yml
+++ b/.azuredevops/components/rocm-cmake.yml
@@ -81,7 +81,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
        componentName: rocm-cmake
-        testParameters: '-E "pass-version-parent" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
+        testParameters: '-E "pass-version-parent" --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
--- a/.azuredevops/components/rocm-examples.yml
+++ b/.azuredevops/components/rocm-examples.yml
@@ -21,13 +21,19 @@ parameters:
    - libtbb-dev
    - libtiff-dev
    - libva-amdgpu-dev
+    - libavcodec-dev
+    - libavformat-dev
+    - libavutil-dev
    - ninja-build
    - python3-pip
 - name: rocmDependencies
  type: object
  default:
    - AMDMIGraphX
+    - aomp
+    - aomp-extras
    - clr
+    - composable_kernel
    - hipBLAS
    - hipBLAS-common
    - hipBLASLt
@@ -40,7 +46,10 @@ parameters:
    - hipTensor
    - llvm-project
    - MIOpen
+    - MIVisionX
+    - rocALUTION
    - rocBLAS
+    - rocDecode
    - rocFFT
    - rocJPEG
    - rocPRIM
@@ -57,7 +66,10 @@ parameters:
  type: object
  default:
    - AMDMIGraphX
+    - aomp
+    - aomp-extras
    - clr
+    - composable_kernel
    - hipBLAS
    - hipBLAS-common
    - hipBLASLt
@@ -70,7 +82,10 @@ parameters:
    - hipTensor
    - llvm-project
    - MIOpen
+    - MIVisionX
+    - rocALUTION
    - rocBLAS
+    - rocDecode
    - rocFFT
    - rocminfo
    - rocPRIM
--- a/.azuredevops/components/rocprofiler-sdk.yml
+++ b/.azuredevops/components/rocprofiler-sdk.yml
@@ -79,27 +79,27 @@ parameters:
  type: object
  default:
    buildJobs:
-      - gfx942:
-        target: gfx942
-      - gfx90a:
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    testJobs:
-      - gfx942:
-        target: gfx942
-      - gfx90a:
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocprofiler_sdk_build_${{ job.target }}
+  - job: rocprofiler_sdk_build_${{ job.os }}_${{ job.target }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
-          - ${{ build }}_${{ job.target }}
+          - ${{ build }}_${{ job.os}}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
@@ -107,6 +107,7 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
+        packageManager: ${{ job.packageManager }}
        registerROCmPackages: true
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -118,6 +119,7 @@ jobs:
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
+        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
@@ -132,6 +134,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        componentName: ${{ parameters.componentName }}
+        os: ${{ job.os }}
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DROCPROFILER_BUILD_TESTS=ON
@@ -143,6 +146,7 @@ jobs:
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
@@ -158,8 +162,8 @@ jobs:

 - ${{ if eq(parameters.unifiedBuild, False) }}:
  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: rocprofiler_sdk_test_${{ job.target }}
-      dependsOn: rocprofiler_sdk_build_${{ job.target }}
+    - job: rocprofiler_sdk_test_${{ job.os }}_${{ job.target }}
+      dependsOn: rocprofiler_sdk_build_${{ job.os }}_${{ job.target }}
      condition:
        and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -177,6 +181,7 @@ jobs:
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          pipModules: ${{ parameters.pipModules }}
+          packageManager: ${{ job.packageManager }}
          registerROCmPackages: true
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -188,6 +193,7 @@ jobs:
        parameters:
          checkoutRef: ${{ parameters.checkoutRef }}
          dependencyList: ${{ parameters.rocmDependencies }}
+          os: ${{ job.os }}
          gpuTarget: ${{ job.target }}
          ${{ if parameters.triggerDownstreamJobs }}:
              downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
@@ -202,6 +208,7 @@ jobs:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
        parameters:
          componentName: ${{ parameters.componentName }}
+          os: ${{ job.os }}
          extraBuildFlags: >-
            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
            -DROCPROFILER_BUILD_TESTS=ON
@@ -213,7 +220,8 @@ jobs:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
-          testDir: $(Agent.BuildDirectory)/s/build
+          os: ${{ job.os }}
+          testDir: $(Agent.BuildDirectory)/build
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/templates/steps/test.yml
+++ b/.azuredevops/templates/steps/test.yml
@@ -13,7 +13,7 @@ parameters:
  default: ctest
 - name: testParameters
  type: string
-  default: --output-on-failure --force-new-ctest-process --output-junit test_output.xml
+  default: --extra-verbose --output-on-failure --force-new-ctest-process --output-junit test_output.xml
 - name: extraTestParameters
  type: string
  default: ''
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -27,6 +27,7 @@ ASICs
 ASan
 ASAN
 ASm
+Async
 ATI
 atomicRMW
 AddressSanitizer
@@ -34,6 +35,7 @@ AlexNet
 Andrej
 Arb
 Autocast
+autograd
 BARs
 BatchNorm
 BLAS
@@ -86,9 +88,11 @@ Conda
 ConnectX
 CountOnes
 CuPy
+customizable
 da
 Dashboarding
 Dataloading
+dataflows
 DBRX
 DDR
 DF
@@ -130,6 +134,7 @@ ELMo
 ENDPGM
 EPYC
 ESXi
+EP
 EoS
 etcd
 fas
@@ -181,8 +186,9 @@ GPR
 GPT
 GPU
 GPU's
+GPUDirect
 GPUs
-Graphbolt
+GraphBolt
 GraphSage
 GRBM
 GRE
@@ -212,6 +218,7 @@ Haswell
 Higgs
 href
 Hyperparameters
+HybridEngine
 Huggingface
 IB
 ICD
@@ -298,6 +305,7 @@ Makefiles
 Matplotlib
 Matrox
 MaxText
+MBT
 Megablocks
 Megatrends
 Megatron
@@ -307,6 +315,7 @@ Meta's
 Miniconda
 MirroredStrategy
 Mixtral
+MLA
 MosaicML
 MoEs
 Mooncake
@@ -349,6 +358,7 @@ OFED
 OMM
 OMP
 OMPI
+OOM
 OMPT
 OMPX
 ONNX
@@ -394,6 +404,7 @@ Profiler's
 PyPi
 Pytest
 PyTorch
+QPS
 Qcycles
 Qwen
 RAII
@@ -669,6 +680,7 @@ denoised
 denoises
 denormalize
 dequantization
+dequantized
 dequantizes
 deserializers
 detections
@@ -784,6 +796,7 @@ linalg
 linearized
 linter
 linux
+llm
 llvm
 lm
 localscratch
@@ -834,6 +847,7 @@ passthrough
 pe
 perfcounter
 performant
+piecewise
 perl
 pragma
 pre
@@ -980,6 +994,7 @@ tokenizer
 tokenizes
 toolchain
 toolchains
+topk
 toolset
 toolsets
 torchtitan
@@ -1007,6 +1022,7 @@ USM
 UTCL
 UTIL
 utils
+UX
 vL
 variational
 vdi
--- a/docs/compatibility/ml-compatibility/dgl-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/dgl-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
    :description: Deep Graph Library (DGL) compatibility
-    :keywords: GPU, DGL compatibility
+    :keywords: GPU, CPU, deep graph library, DGL, deep learning, framework compatibility

 .. version-set:: rocm_version latest

@@ -10,24 +10,42 @@
 DGL compatibility
 ********************************************************************************

-Deep Graph Library `(DGL) <https://www.dgl.ai/>`_ is an easy-to-use, high-performance and scalable 
+Deep Graph Library (`DGL <https://www.dgl.ai/>`__) is an easy-to-use, high-performance, and scalable 
 Python package for deep learning on graphs. DGL is framework agnostic, meaning 
-if a deep graph model is a component in an end-to-end application, the rest of 
+that if a deep graph model is a component in an end-to-end application, the rest of 
 the logic is implemented using PyTorch.  

-* ROCm support for DGL is hosted in the `https://github.com/ROCm/dgl <https://github.com/ROCm/dgl>`_ repository. 
-* Due to independent compatibility considerations, this location differs from the `https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`_ upstream repository. 
-* Use the prebuilt :ref:`Docker images <dgl-docker-compat>` with DGL, PyTorch, and ROCm preinstalled.
-* See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>` 
-  to install and get started.
+DGL provides a high-performance graph object that can reside on either CPUs or GPUs. 
+It bundles structural data features for better control and provides a variety of functions 
+for computing with graph objects, including efficient and customizable message passing 
+primitives for Graph Neural Networks.

-
-Supported devices
+Support overview
 ================================================================================

- **Officially Supported**: TF32 with AMD Instinct MI300X (through hipblaslt)
- **Partially Supported**: TF32 with AMD Instinct MI250X
+- The ROCm-supported version of DGL is maintained in the official `https://github.com/ROCm/dgl 
+  <https://github.com/ROCm/dgl>`__ repository, which differs from the 
+  `https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`__ upstream repository.

+- To get started and install DGL on ROCm, use the prebuilt :ref:`Docker images <dgl-docker-compat>`, 
+  which include ROCm, DGL, and all required dependencies.
+
+  - See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>` 
+    for installation and setup instructions.
+
+  - You can also consult the upstream `Installation guide <https://www.dgl.ai/pages/start.html>`__ 
+    for additional context.
+
+Version support
+--------------------------------------------------------------------------------
+
+DGL is supported on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
+
+Supported devices
+--------------------------------------------------------------------------------
+
+- **Officially Supported**: AMD Instinct™ MI300X (through `hipBLASLt <https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/index.html>`__)
+- **Partially Supported**: AMD Instinct™ MI250X

 .. _dgl-recommendations:

@@ -35,7 +53,7 @@ Use cases and recommendations
 ================================================================================

 DGL can be used for Graph Learning, and building popular graph models like  
-GAT, GCN and GraphSage. Using these we can support a variety of use-cases such as:
+GAT, GCN, and GraphSage. Using these models, a variety of use cases are supported:

 - Recommender systems
 - Network Optimization and Analysis
@@ -62,16 +80,17 @@ Docker image compatibility

   <i class="fab fa-docker"></i>

-AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl>`_
-with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
-inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_.
+AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl/tags>`__
+with ROCm backends on Docker Hub. The following Docker image tags and associated
+inventories represent the latest available DGL version from the official Docker Hub. 
 Click the |docker-icon| to view the image on Docker Hub.

 .. list-table:: DGL Docker image components
    :header-rows: 1
    :class: docker-image-compatibility

-    * - Docker
+    * - Docker image
+      - ROCm
      - DGL
      - PyTorch
      - Ubuntu
@@ -81,102 +100,106 @@ Click the |docker-icon| to view the image on Docker Hub.

           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i></a>

-      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
-      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
+      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
+      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`__
      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__

    * - .. raw:: html

           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i></a>

-      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
-      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
+      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
+      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`__


    * - .. raw:: html

           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i></a>

-      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
-      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
+      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
+      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`__
      - 22.04
-      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__


    * - .. raw:: html

           <a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i></a>

-      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
-      - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
+      - `6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__
+      - `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`__
+      - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`__
      - 22.04
-      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`__
      

 Key ROCm libraries for DGL
 ================================================================================

 DGL on ROCm depends on specific libraries that affect its features and performance.
-Using the DGL Docker container or building it with the provided docker file or a ROCm base image is recommended.
+Using the DGL Docker container or building it with the provided Dockerfile or a ROCm base image is recommended.
 If you prefer to build it yourself, ensure the following dependencies are installed:

 .. list-table:: 
    :header-rows: 1

    * - ROCm library
-      - Version
+      - ROCm 6.4.0 Version
      - Purpose
    * - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
-      - :version-ref:`"Composable Kernel" rocm_version`
+      - 1.1.0
      - Enables faster execution of core operations like matrix multiplication
        (GEMM), convolutions and transformations.
    * - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
-      - :version-ref:`hipBLAS rocm_version`
+      - 2.4.0
      - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
        matrix and vector operations.
    * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
-      - :version-ref:`hipBLASLt rocm_version`
+      - 0.12.0
      - hipBLASLt is an extension of the hipBLAS library, providing additional
        features like epilogues fused into the matrix multiplication kernel or
        use of integer tensor cores.
    * - `hipCUB <https://github.com/ROCm/hipCUB>`_
-      - :version-ref:`hipCUB rocm_version`
+      - 3.4.0
      - Provides a C++ template library for parallel algorithms for reduction,
        scan, sort and select.
    * - `hipFFT <https://github.com/ROCm/hipFFT>`_
-      - :version-ref:`hipFFT rocm_version`
+      - 1.0.18
      - Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
    * - `hipRAND <https://github.com/ROCm/hipRAND>`_
-      - :version-ref:`hipRAND rocm_version`
+      - 2.12.0
      - Provides fast random number generation for GPUs.
    * - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
-      - :version-ref:`hipSOLVER rocm_version`
+      - 2.4.0
      - Provides GPU-accelerated solvers for linear systems, eigenvalues, and
        singular value decompositions (SVD).
    * - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
-      - :version-ref:`hipSPARSE rocm_version`
+      - 3.2.0
      - Accelerates operations on sparse matrices, such as sparse matrix-vector
        or matrix-matrix products.
    * - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
-      - :version-ref:`hipSPARSELt rocm_version`
+      - 0.2.3
      - Accelerates operations on sparse matrices, such as sparse matrix-vector
        or matrix-matrix products.
    * - `hipTensor <https://github.com/ROCm/hipTensor>`_
-      - :version-ref:`hipTensor rocm_version`
+      - 1.5.0
      - Optimizes for high-performance tensor operations, such as contractions.
    * - `MIOpen <https://github.com/ROCm/MIOpen>`_
-      - :version-ref:`MIOpen rocm_version`
+      - 3.4.0
      - Optimizes deep learning primitives such as convolutions, pooling,
        normalization, and activation functions.
    * - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
-      - :version-ref:`MIGraphX rocm_version`
+      - 2.12.0
      - Adds graph-level optimizations, ONNX models and mixed precision support
        and enable Ahead-of-Time (AOT) Compilation.
    * - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
-      - :version-ref:`MIVisionX rocm_version`
+      - 3.2.0
      - Optimizes acceleration for computer vision and AI workloads like
        preprocessing, augmentation, and inferencing.
    * - `rocAL <https://github.com/ROCm/rocAL>`_
@@ -184,25 +207,25 @@ If you prefer to build it yourself, ensure the following dependencies are instal
      - Accelerates the data pipeline by offloading intensive preprocessing and
        augmentation tasks. rocAL is part of MIVisionX.
    * - `RCCL <https://github.com/ROCm/rccl>`_
-      - :version-ref:`RCCL rocm_version`
+      - 2.2.0
      - Optimizes for multi-GPU communication for operations like AllReduce and
        Broadcast.
    * - `rocDecode <https://github.com/ROCm/rocDecode>`_
-      - :version-ref:`rocDecode rocm_version`
+      - 0.10.0
      - Provides hardware-accelerated data decoding capabilities, particularly
        for image, video, and other dataset formats.
    * - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
-      - :version-ref:`rocJPEG rocm_version`
+      - 0.8.0
      - Provides hardware-accelerated JPEG image decoding and encoding.
    * - `RPP <https://github.com/ROCm/RPP>`_
-      - :version-ref:`RPP rocm_version`
+      - 1.9.10
      - Speeds up data augmentation, transformation, and other preprocessing steps.
    * - `rocThrust <https://github.com/ROCm/rocThrust>`_
-      - :version-ref:`rocThrust rocm_version`
+      - 3.3.0
      - Provides a C++ template library for parallel algorithms like sorting,
        reduction, and scanning.
    * - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
-      - :version-ref:`rocWMMA rocm_version`
+      - 1.7.0
      - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
        multiplication (GEMM) and accumulation operations with mixed precision
        support.
@@ -211,14 +234,14 @@ If you prefer to build it yourself, ensure the following dependencies are instal
 Supported features
 ================================================================================

-Many functions and methods available in DGL Upstream are also supported in DGL ROCm.
+Many functions and methods available upstream are also supported in DGL on ROCm.
 Instead of listing them all, support is grouped into the following categories to provide a general overview. 

 * DGL Base
 * DGL Backend 
 * DGL Data
 * DGL Dataloading
-* DGL DGLGraph
+* DGL Graph
 * DGL Function
 * DGL Ops
 * DGL Sampling
@@ -235,9 +258,9 @@ Instead of listing them all, support is grouped into the following categories to
 Unsupported features
 ================================================================================

-* Graphbolt
-* Partial TF32 Support (MI250x only)
-* Kineto/ ROCTracer integration
+* GraphBolt
+* Partial TF32 Support (MI250X only)
+* Kineto/ROCTracer integration


 Unsupported functions
--- a/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/flashinfer-compatibility.rst
@@ -1,8 +1,8 @@
 :orphan:

 .. meta::
-    :description: FlashInfer deep learning framework compatibility
-    :keywords: GPU, LLM, FlashInfer, compatibility
+    :description: FlashInfer compatibility
+    :keywords: GPU, LLM, FlashInfer, deep learning, framework compatibility

 .. version-set:: rocm_version latest

@@ -11,7 +11,7 @@ FlashInfer compatibility
 ********************************************************************************

 `FlashInfer <https://docs.flashinfer.ai/index.html>`__ is a library and kernel generator 
-for Large Language Models (LLMs) that provides high-performance implementation of graphics 
+for Large Language Models (LLMs) that provides a high-performance implementation of graphics 
 processing unit (GPU) kernels. FlashInfer focuses on LLM serving and inference, as well 
 as advanced performance across diverse scenarios.

@@ -25,28 +25,30 @@ offers high-performance LLM-specific operators, with easy integration through Py
  For the latest feature compatibility matrix, refer to the ``README`` of the 
  `https://github.com/ROCm/flashinfer <https://github.com/ROCm/flashinfer>`__ repository.

-Support for the ROCm port of FlashInfer is available as follows:
+Support overview
+================================================================================

- ROCm support for FlashInfer is hosted in the `https://github.com/ROCm/flashinfer 
-  <https://github.com/ROCm/flashinfer>`__ repository. This location differs from the 
-  `https://github.com/flashinfer-ai/flashinfer <https://github.com/flashinfer-ai/flashinfer>`_ 
+- The ROCm-supported version of FlashInfer is maintained in the official `https://github.com/ROCm/flashinfer 
+  <https://github.com/ROCm/flashinfer>`__ repository, which differs from the 
+  `https://github.com/flashinfer-ai/flashinfer <https://github.com/flashinfer-ai/flashinfer>`__ 
  upstream repository.

- To install FlashInfer, use the prebuilt :ref:`Docker image <flashinfer-docker-compat>`, 
-  which includes ROCm, FlashInfer, and all required dependencies.
+- To get started and install FlashInfer on ROCm, use the prebuilt :ref:`Docker images <flashinfer-docker-compat>`, 
+  which include ROCm, FlashInfer, and all required dependencies.

  - See the :doc:`ROCm FlashInfer installation guide <rocm-install-on-linux:install/3rd-party/flashinfer-install>` 
-    to install and get started.
+    for installation and setup instructions.

-  - See the `Installation guide <https://docs.flashinfer.ai/installation.html>`__ 
-    in the upstream FlashInfer documentation.
+  - You can also consult the upstream `Installation guide <https://docs.flashinfer.ai/installation.html>`__ 
+    for additional context.

-.. note::
+Version support
+--------------------------------------------------------------------------------

-  Flashinfer is supported on ROCm 6.4.1.
+FlashInfer is supported on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.

 Supported devices
-================================================================================
+--------------------------------------------------------------------------------

 **Officially Supported**: AMD Instinct™ MI300X

@@ -78,10 +80,9 @@ Docker image compatibility

   <i class="fab fa-docker"></i>

-AMD validates and publishes `ROCm FlashInfer images <https://hub.docker.com/r/rocm/flashinfer/tags>`__
-with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
-inventories represent the FlashInfer version from the official Docker Hub.
-The Docker images have been validated for `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
+AMD validates and publishes `FlashInfer images <https://hub.docker.com/r/rocm/flashinfer/tags>`__
+with ROCm backends on Docker Hub. The following Docker image tag and associated
+inventories represent the latest available FlashInfer version from the official Docker Hub. 
 Click |docker-icon| to view the image on Docker Hub.

 .. list-table:: 
--- a/docs/compatibility/ml-compatibility/jax-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/jax-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
   :description: JAX compatibility
-   :keywords: GPU, JAX compatibility
+   :keywords: GPU, JAX, deep learning, framework compatibility

 .. version-set:: rocm_version latest

@@ -10,42 +10,38 @@
 JAX compatibility
 *******************************************************************************

-JAX provides a NumPy-like API, which combines automatic differentiation and the
-Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine
-learning at scale.
+`JAX <https://docs.jax.dev/en/latest/notebooks/thinking_in_jax.html>`__ is a library 
+for array-oriented numerical computation (similar to NumPy), with automatic differentiation 
+and just-in-time (JIT) compilation to enable high-performance machine learning research.

-JAX uses composable transformations of Python and NumPy through just-in-time
-(JIT) compilation, automatic vectorization, and parallelization. To learn about
-JAX, including profiling and optimizations, see the official `JAX documentation
-<https://jax.readthedocs.io/en/latest/notebooks/quickstart.html>`_.
+JAX provides an API that combines automatic differentiation and the 
+Accelerated Linear Algebra (XLA) compiler to achieve high-performance machine 
+learning at scale. JAX uses composable transformations of Python and NumPy through 
+JIT compilation, automatic vectorization, and parallelization.

-ROCm support for JAX is upstreamed, and users can build the official source code
-with ROCm support:
+Support overview
+================================================================================

- ROCm JAX release:
+- The ROCm-supported version of JAX is maintained in the official `https://github.com/ROCm/rocm-jax 
+  <https://github.com/ROCm/rocm-jax>`__ repository, which differs from the 
+  `https://github.com/jax-ml/jax <https://github.com/jax-ml/jax>`__ upstream repository.

-  - Offers AMD-validated and community :ref:`Docker images <jax-docker-compat>`
-    with ROCm and JAX preinstalled.
+- To get started and install JAX on ROCm, use the prebuilt :ref:`Docker images <jax-docker-compat>`, 
+  which include ROCm, JAX, and all required dependencies.

-  - ROCm JAX repository: `ROCm/rocm-jax <https://github.com/ROCm/rocm-jax>`_
+  - See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>` 
+    for installation and setup instructions.

-  - See the :doc:`ROCm JAX installation guide <rocm-install-on-linux:install/3rd-party/jax-install>`
-    to get started.
+  - You can also consult the upstream `Installation guide <https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`__ 
+    for additional context.

- Official JAX release:
+Version support
+--------------------------------------------------------------------------------

-  - Official JAX repository: `jax-ml/jax <https://github.com/jax-ml/jax>`_
-
-  - See the `AMD GPU (Linux) installation section
-    <https://jax.readthedocs.io/en/latest/installation.html#amd-gpu-linux>`_ in
-    the JAX documentation.
-
-.. note::
-
-   AMD releases official `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
-   quarterly alongside new ROCm releases. These images undergo full AMD testing.
-   `Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
-   follow upstream JAX releases and use the latest available ROCm version.
+AMD releases official `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax/tags>`_
+quarterly alongside new ROCm releases. These images undergo full AMD testing.
+`Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community/tags>`_
+follow upstream JAX releases and use the latest available ROCm version.

 Use cases and recommendations
 ================================================================================
@@ -71,7 +67,7 @@ Use cases and recommendations
 * The `Distributed fine-tuning with JAX on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/distributed-sft-jax/README.html>`_
  outlines the process of fine-tuning a Bidirectional Encoder Representations
  from Transformers (BERT)-based large language model (LLM) using JAX for a text
-  classification task. The blog post discuss techniques for parallelizing the
+  classification task. The blog post discusses techniques for parallelizing the
  fine-tuning across multiple AMD GPUs and assesses the model's performance on a
  holdout dataset. During the fine-tuning, a BERT-base-cased transformer model
  and the General Language Understanding Evaluation (GLUE) benchmark dataset was
@@ -90,9 +86,9 @@ For more use cases and recommendations, see `ROCm JAX blog posts <https://rocm.b
 Docker image compatibility
 ================================================================================

-AMD provides preconfigured Docker images with JAX and the ROCm backend.
-These images are published on `Docker Hub <https://hub.docker.com/r/rocm/jax>`__ and are the
-recommended way to get started with deep learning with JAX on ROCm.
+AMD validates and publishes `JAX images <https://hub.docker.com/r/rocm/jax/tags>`__
+with ROCm backends on Docker Hub.
+
 For ``jax-community`` images, see `rocm/jax-community
 <https://hub.docker.com/r/rocm/jax-community/tags>`__ on Docker Hub.

@@ -234,7 +230,7 @@ The ROCm supported data types in JAX are collected in the following table.

 .. note::

-  JAX data type support is effected by the :ref:`key_rocm_libraries` and it's
+  JAX data type support is affected by the :ref:`key_rocm_libraries` and is
  collected on the :doc:`ROCm data types and precision support <rocm:reference/precision-support>`
  page.

--- a/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
@@ -1,8 +1,8 @@
 :orphan:

 .. meta::
-    :description: llama.cpp deep learning framework compatibility
-    :keywords: GPU, GGML, llama.cpp compatibility
+    :description: llama.cpp compatibility
+    :keywords: GPU, GGML, llama.cpp, deep learning, framework compatibility

 .. version-set:: rocm_version latest

@@ -20,33 +20,32 @@ to accelerate inference and reduce memory usage. Originally built as a CPU-first
 llama.cpp is easy to integrate with other programming environments and is widely 
 adopted across diverse platforms, including consumer devices. 

-ROCm support for llama.cpp is upstreamed, and you can build the official source code
-with ROCm support:
-
- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp 
-  <https://github.com/ROCm/llama.cpp>`_ repository.
-
- Due to independent compatibility considerations, this location differs from the 
-  `https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.
-
- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`, 
-  which includes ROCm, llama.cpp, and all required dependencies.
-
-  - See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>` 
-    to install and get started.
-
-  - See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__ 
-    in the upstream llama.cpp documentation.
-
-.. note::
-
-  llama.cpp is supported on ROCm 7.0.0 and ROCm 6.4.x.
-
-Supported devices
+Support overview
 ================================================================================

-**Officially Supported**: AMD Instinct™ MI300X, MI325X, MI210
+- The ROCm-supported version of llama.cpp is maintained in the official `https://github.com/ROCm/llama.cpp 
+  <https://github.com/ROCm/llama.cpp>`__ repository, which differs from the 
+  `https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`__ upstream repository.

+- To get started and install llama.cpp on ROCm, use the prebuilt :ref:`Docker images <llama-cpp-docker-compat>`, 
+  which include ROCm, llama.cpp, and all required dependencies.
+
+  - See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>` 
+    for installation and setup instructions.
+
+  - You can also consult the upstream `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md>`__ 
+    for additional context.
+
+Version support
+--------------------------------------------------------------------------------
+
+llama.cpp is supported on `ROCm 7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__ and 
+`ROCm 6.4.x <https://repo.radeon.com/rocm/apt/6.4/>`__.
+
+Supported devices
+--------------------------------------------------------------------------------
+
+**Officially Supported**: AMD Instinct™ MI300X, MI325X, MI210

 Use cases and recommendations
 ================================================================================
@@ -84,9 +83,9 @@ Docker image compatibility

   <i class="fab fa-docker"></i>

-AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp/tags>`__
+AMD validates and publishes `llama.cpp images <https://hub.docker.com/r/rocm/llama.cpp/tags>`__
 with ROCm backends on Docker Hub. The following Docker image tags and associated
-inventories represent the available llama.cpp versions from the official Docker Hub.
+inventories represent the latest available llama.cpp versions from the official Docker Hub.
 Click |docker-icon| to view the image on Docker Hub.

 .. important::
--- a/docs/compatibility/ml-compatibility/megablocks-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/megablocks-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
    :description: Megablocks compatibility
-    :keywords: GPU, megablocks, compatibility
+    :keywords: GPU, megablocks, deep learning, framework compatibility

 .. version-set:: rocm_version latest

@@ -10,28 +10,42 @@
 Megablocks compatibility
 ********************************************************************************

-Megablocks is a light-weight library for mixture-of-experts (MoE) training. 
+`Megablocks <https://github.com/databricks/megablocks>`__ is a lightweight library 
+for mixture-of-experts `(MoE) <https://huggingface.co/blog/moe>`__ training. 
 The core of the system is efficient "dropless-MoE" and standard MoE layers. 
-Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_, 
+Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM 
+<https://github.com/stanford-futuredata/Megatron-LM>`__, 
 where data and pipeline parallel training of MoEs is supported.

-* ROCm support for Megablocks is hosted in the official `https://github.com/ROCm/megablocks <https://github.com/ROCm/megablocks>`_ repository. 
-* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository. 
-* Use the prebuilt :ref:`Docker image <megablocks-docker-compat>` with ROCm, PyTorch, and Megablocks preinstalled. 
-* See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>` to install and get started.
+Support overview
+================================================================================

-.. note::
+- The ROCm-supported version of Megablocks is maintained in the official `https://github.com/ROCm/megablocks 
+  <https://github.com/ROCm/megablocks>`__ repository, which differs from the 
+  `https://github.com/databricks/megablocks <https://github.com/databricks/megablocks>`__ upstream repository.

-  Megablocks is supported on ROCm 6.3.0.
+- To get started and install Megablocks on ROCm, use the prebuilt :ref:`Docker image <megablocks-docker-compat>`, 
+  which includes ROCm, Megablocks, and all required dependencies.
+
+  - See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>` 
+    for installation and setup instructions.
+
+  - You can also consult the upstream `Installation guide <https://github.com/databricks/megablocks>`__ 
+    for additional context.
+
+Version support
+--------------------------------------------------------------------------------
+
+Megablocks is supported on `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`__.

 Supported devices
-================================================================================
+--------------------------------------------------------------------------------

- **Officially Supported**: AMD Instinct MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
+- **Officially Supported**: AMD Instinct™ MI300X
+- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210

 Supported models and features
-================================================================================
+--------------------------------------------------------------------------------

 This section summarizes the Megablocks features supported by ROCm.

@@ -41,20 +55,28 @@ This section summarizes the Megablocks features supported by ROCm.
 * Mixture-of-Experts
 * dropless-Mixture-of-Experts

-
 .. _megablocks-recommendations:

 Use cases and recommendations
 ================================================================================

-The `ROCm Megablocks blog posts <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_ 
-guide how to leverage the ROCm platform for pre-training using the Megablocks framework. 
+* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs 
+  <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`__ 
+  blog post explains how to leverage the ROCm platform for pre-training using the 
+  Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts 
+  (MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it 
+  demonstrates how block-sparse computations can enhance scalability and efficiency in MoE 
+  training. The guide provides step-by-step instructions for setting up the environment, 
+  including cloning the repository, building the Docker image, and running the training container. 
+  Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training 
+  language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE 
+  training workflows for large-scale transformer models.
+
 It features how to pre-process datasets and how to begin pre-training on AMD GPUs through:

 * Single-GPU pre-training
 * Multi-GPU pre-training

-
 .. _megablocks-docker-compat:

 Docker image compatibility
@@ -64,10 +86,9 @@ Docker image compatibility

   <i class="fab fa-docker"></i>

-AMD validates and publishes `ROCm Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`_
-with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
-inventories represent the latest Megatron-LM version from the official Docker Hub.
-The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
+AMD validates and publishes `Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`__
+with ROCm backends on Docker Hub. The following Docker image tag and associated
+inventories represent the latest available Megablocks version from the official Docker Hub. 
 Click |docker-icon| to view the image on Docker Hub.

 .. list-table:: 
--- a/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
    :description: PyTorch compatibility
-    :keywords: GPU, PyTorch compatibility
+    :keywords: GPU, PyTorch, deep learning, framework compatibility

 .. version-set:: rocm_version latest

@@ -15,40 +15,42 @@ deep learning. PyTorch on ROCm provides mixed-precision and large-scale training
 using `MIOpen <https://github.com/ROCm/MIOpen>`__ and
 `RCCL <https://github.com/ROCm/rccl>`__ libraries.

-ROCm support for PyTorch is upstreamed into the official PyTorch repository. Due
-to independent compatibility considerations, this results in two distinct
-release cycles for PyTorch on ROCm:
+PyTorch provides two high-level features:

- ROCm PyTorch release:
+- Tensor computation (like NumPy) with strong GPU acceleration

-  - Provides the latest version of ROCm but might not necessarily support the
-    latest stable PyTorch version.
+- Deep neural networks built on a tape-based autograd system (rapid computation 
+  of multiple partial derivatives or gradients)

-  - Offers :ref:`Docker images <pytorch-docker-compat>` with ROCm and PyTorch
-    preinstalled.
+Support overview
+================================================================================

-  - ROCm PyTorch repository: `<https://github.com/ROCm/pytorch>`__
+ROCm support for PyTorch is upstreamed into the official PyTorch repository. 
+ROCm development is aligned with the stable release of PyTorch, while upstream 
+PyTorch testing uses the stable release of ROCm to maintain consistency:

-  - See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>`
-    to get started.
+- The ROCm-supported version of PyTorch is maintained in the official `https://github.com/ROCm/pytorch 
+  <https://github.com/ROCm/pytorch>`__ repository, which differs from the 
+  `https://github.com/pytorch/pytorch <https://github.com/pytorch/pytorch>`__ upstream repository.

- Official PyTorch release:
+- To get started and install PyTorch on ROCm, use the prebuilt :ref:`Docker images <pytorch-docker-compat>`, 
+  which include ROCm, PyTorch, and all required dependencies.

-  - Provides the latest stable version of PyTorch  but might not necessarily
-    support the latest ROCm version.
+  - See the :doc:`ROCm PyTorch installation guide <rocm-install-on-linux:install/3rd-party/pytorch-install>` 
+    for installation and setup instructions.

-  - Official PyTorch repository: `<https://github.com/pytorch/pytorch>`__
-
-  - See the `Nightly and latest stable version installation guide <https://pytorch.org/get-started/locally/>`__
-    or `Previous versions <https://pytorch.org/get-started/previous-versions/>`__
-    to get started.
+  - You can also consult the upstream `Installation guide <https://pytorch.org/get-started/locally/>`__ or 
+    `Previous versions <https://pytorch.org/get-started/previous-versions/>`__ for additional context.

 PyTorch includes tooling that generates HIP source code from the CUDA backend.
 This approach allows PyTorch to support ROCm without requiring manual code
 modifications. For more information, see :doc:`HIPIFY <hipify:index>`.

-ROCm development is aligned with the stable release of PyTorch, while upstream
-PyTorch testing uses the stable release of ROCm to maintain consistency.
+Version support
+--------------------------------------------------------------------------------
+
+AMD releases official `ROCm PyTorch Docker images <https://hub.docker.com/r/rocm/pytorch/tags>`_
+quarterly alongside new ROCm releases. These images undergo full AMD testing.

 .. _pytorch-recommendations:

@@ -78,7 +80,7 @@ Use cases and recommendations
  GPU.

 * The :doc:`Inception with PyTorch documentation </conceptual/ai-pytorch-inception>`
-  describes how PyTorch integrates with ROCm for AI workloads It outlines the
+  describes how PyTorch integrates with ROCm for AI workloads. It outlines the
  use of PyTorch on the ROCm platform and focuses on efficiently leveraging AMD
  GPU hardware for training and inference tasks in AI applications.

@@ -89,9 +91,8 @@ For more use cases and recommendations, see `ROCm PyTorch blog posts <https://ro
 Docker image compatibility
 ================================================================================

-AMD provides preconfigured Docker images with PyTorch and the ROCm backend.
-These images are published on `Docker Hub <https://hub.docker.com/r/rocm/pytorch>`__ and are the
-recommended way to get started with deep learning with PyTorch on ROCm.
+AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch/tags>`__
+with ROCm backends on Docker Hub.

 To find the right image tag, see the :ref:`PyTorch on ROCm installation
 documentation <rocm-install-on-linux:pytorch-docker-support>` for a list of
--- a/docs/compatibility/ml-compatibility/ray-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/ray-compatibility.rst
@@ -1,8 +1,8 @@
 :orphan:

 .. meta::
-    :description: Ray deep learning framework compatibility
-    :keywords: GPU, Ray compatibility
+    :description: Ray compatibility
+    :keywords: GPU, Ray, deep learning, framework compatibility

 .. version-set:: rocm_version latest

@@ -19,36 +19,35 @@ simplifying machine learning computations.
 Ray is a general-purpose framework that runs many types of workloads efficiently. 
 Any Python application can be scaled with Ray, without extra infrastructure.

-ROCm support for Ray is upstreamed, and you can build the official source code
-with ROCm support: 
-
- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray 
-  <https://github.com/ROCm/ray>`_ repository.
-
- Due to independent compatibility considerations, this location differs from the 
-  `https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.
-
- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>` 
-  which includes ROCm, Ray, and all required dependencies.
-
-  - See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>` 
-    for instructions to get started.
-
-  - See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_ 
-    in the upstream Ray documentation.
-
-  - The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__ 
-    corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
-
-.. note::
-
-  Ray is supported on ROCm 6.4.1.
-
-Supported devices
+Support overview
 ================================================================================

-**Officially Supported**: AMD Instinct™ MI300X, MI210
+- The ROCm-supported version of Ray is maintained in the official `https://github.com/ROCm/ray 
+  <https://github.com/ROCm/ray>`__ repository, which differs from the 
+  `https://github.com/ray-project/ray <https://github.com/ray-project/ray>`__ upstream repository.

+- To get started and install Ray on ROCm, use the prebuilt :ref:`Docker image <ray-docker-compat>`, 
+  which includes ROCm, Ray, and all required dependencies.
+
+  - The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels 
+    <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__ 
+    corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
+
+  - See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>` 
+    for installation and setup instructions.
+
+  - You can also consult the upstream `Installation guide <https://docs.ray.io/en/latest/ray-overview/installation.html>`__ 
+    for additional context.
+
+Version support
+--------------------------------------------------------------------------------
+
+Ray is supported on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__.
+
+Supported devices
+--------------------------------------------------------------------------------
+
+**Officially Supported**: AMD Instinct™ MI300X, MI210

 Use cases and recommendations
 ================================================================================
@@ -88,15 +87,15 @@ Docker image compatibility

 AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
 with ROCm backends on Docker Hub. The following Docker image tags and
-associated inventories represent the latest Ray version from the official Docker Hub and are validated for
-`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
-icon to view the image on Docker Hub.
+associated inventories represent the latest Ray version from the official Docker Hub.
+Click the |docker-icon| icon to view the image on Docker Hub.

 .. list-table::
    :header-rows: 1
    :class: docker-image-compatibility

    * - Docker image
+      - ROCm
      - Ray
      - PyTorch
      - Ubuntu
@@ -105,6 +104,7 @@ icon to view the image on Docker Hub.
    * - .. raw:: html

           <a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
+      - `6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`__
      - `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
      - 2.6.0+git684f6f2
      - 24.04
--- a/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
    :description: Stanford Megatron-LM compatibility
-    :keywords: Stanford, Megatron-LM, compatibility
+    :keywords: Stanford, Megatron-LM, deep learning, framework compatibility

 .. version-set:: rocm_version latest

@@ -10,34 +10,50 @@
 Stanford Megatron-LM compatibility
 ********************************************************************************

-Stanford Megatron-LM is a large-scale language model training framework developed by NVIDIA `https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_. It is
-designed to train massive transformer-based language models efficiently by model and data parallelism. 
+Stanford Megatron-LM is a large-scale language model training framework developed 
+by NVIDIA at `https://github.com/NVIDIA/Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_. 
+It is designed to train massive transformer-based language models efficiently by model 
+and data parallelism. 

-* ROCm support for Stanford Megatron-LM is hosted in the official `https://github.com/ROCm/Stanford-Megatron-LM <https://github.com/ROCm/Stanford-Megatron-LM>`_ repository. 
-* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository. 
-* Use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>` with ROCm, PyTorch, and Megatron-LM preinstalled. 
-* See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>` to install and get started.
+It provides efficient tensor, pipeline, and sequence-based model parallelism for 
+pre-training transformer-based language models such as GPT (Decoder Only), BERT 
+(Encoder Only), and T5 (Encoder-Decoder). 

-.. note::
-
-	Stanford Megatron-LM is supported on ROCm 6.3.0.
-
-
-Supported Devices
+Support overview
 ================================================================================

- **Officially Supported**: AMD Instinct MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210
+- The ROCm-supported version of Stanford Megatron-LM is maintained in the official `https://github.com/ROCm/Stanford-Megatron-LM 
+  <https://github.com/ROCm/Stanford-Megatron-LM>`__ repository, which differs from the 
+  `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`__ upstream repository.

+- To get started and install Stanford Megatron-LM on ROCm, use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>`, 
+  which includes ROCm, Stanford Megatron-LM, and all required dependencies.
+
+  - See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>` 
+    for installation and setup instructions.
+
+  - You can also consult the upstream `Installation guide <https://github.com/NVIDIA/Megatron-LM>`__ 
+    for additional context.
+
+Version support
+--------------------------------------------------------------------------------
+
+Stanford Megatron-LM is supported on `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`__.
+
+Supported devices
+--------------------------------------------------------------------------------
+
+- **Officially Supported**: AMD Instinct™ MI300X
+- **Partially Supported** (functionality or performance limitations): AMD Instinct™ MI250X, MI210

 Supported models and features
-================================================================================
+--------------------------------------------------------------------------------

 This section details models & features that are supported by the ROCm version on Stanford Megatron-LM.

 Models:

-* Bert
+* BERT
 * GPT
 * T5
 * ICT
@@ -54,13 +70,24 @@ Features:
 Use cases and recommendations
 ================================================================================

-See the `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs blog <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_ post  
-to leverage the ROCm platform for pre-training by using the Stanford Megatron-LM framework of pre-processing datasets on AMD GPUs. 
-Coverage includes:
+The following blog post mentions Megablocks, but you can run Stanford Megatron-LM with the same steps to pre-process datasets on AMD GPUs:

-  * Single-GPU pre-training
-  * Multi-GPU pre-training
+* The `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs 
+  <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`__ 
+  blog post explains how to leverage the ROCm platform for pre-training using the
+  Megablocks framework. It introduces a streamlined approach for training Mixture-of-Experts 
+  (MoE) models using the Megablocks library on AMD hardware. Focusing on GPT-2, it 
+  demonstrates how block-sparse computations can enhance scalability and efficiency in MoE 
+  training. The guide provides step-by-step instructions for setting up the environment, 
+  including cloning the repository, building the Docker image, and running the training container. 
+  Additionally, it offers insights into utilizing the ``oscar-1GB.json`` dataset for pre-training 
+  language models. By leveraging Megablocks and the ROCm platform, you can optimize your MoE 
+  training workflows for large-scale transformer models.

+The blog post covers how to pre-process datasets and how to begin pre-training on AMD GPUs, including:
+
+* Single-GPU pre-training
+* Multi-GPU pre-training

 .. _megatron-lm-docker-compat:

@@ -71,10 +98,9 @@ Docker image compatibility

   <i class="fab fa-docker"></i>

-AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/megatron-lm>`_
+AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/stanford-megatron-lm/tags>`_
 with ROCm and Pytorch backends on Docker Hub. The following Docker image tags and associated
-inventories represent the latest Megatron-LM version from the official Docker Hub.
-The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
+inventories represent the latest Stanford Megatron-LM version from the official Docker Hub.
 Click |docker-icon| to view the image on Docker Hub.

 .. list-table:: 
@@ -82,6 +108,7 @@ Click |docker-icon| to view the image on Docker Hub.
    :class: docker-image-compatibility

    * - Docker image
+      - ROCm
      - Stanford Megatron-LM
      - PyTorch
      - Ubuntu
@@ -91,6 +118,7 @@ Click |docker-icon| to view the image on Docker Hub.

           <a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i></a>

+      - `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
      - `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
      - `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
      - 24.04
--- a/docs/compatibility/ml-compatibility/taichi-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/taichi-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
    :description: Taichi compatibility
-    :keywords: GPU, Taichi compatibility
+    :keywords: GPU, Taichi, deep learning, framework compatibility

 .. version-set:: rocm_version latest

@@ -19,28 +19,52 @@ Taichi is widely used across various domains, including real-time physical simul
 numerical computing, augmented reality, artificial intelligence, computer vision, robotics, 
 visual effects in film and gaming, and general-purpose computing.

-* ROCm support for Taichi is hosted in the official `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`_ repository.
-* Due to independent compatibility considerations, this location differs from the `https://github.com/taichi-dev <https://github.com/taichi-dev>`_ upstream repository.
-* Use the prebuilt :ref:`Docker image <taichi-docker-compat>` with ROCm, PyTorch, and Taichi preinstalled.
-* See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>` to install and get started.
+Support overview
+================================================================================

-.. note::
+- The ROCm-supported version of Taichi is maintained in the official `https://github.com/ROCm/taichi 
+  <https://github.com/ROCm/taichi>`__ repository, which differs from the 
+  `https://github.com/taichi-dev/taichi <https://github.com/taichi-dev/taichi>`__ upstream repository.

-	Taichi is supported on ROCm 6.3.2.
+- To get started and install Taichi on ROCm, use the prebuilt :ref:`Docker image <taichi-docker-compat>`, 
+  which includes ROCm, Taichi, and all required dependencies.

-Supported devices and features
-===============================================================================
-There is support through the ROCm software stack for all Taichi GPU features on AMD Instinct MI250X and MI210X Series GPUs with the exception of Taichi’s GPU rendering system, CGUI.
-AMD Instinct MI300X Series GPUs will be supported by November.
+  - See the :doc:`ROCm Taichi installation guide <rocm-install-on-linux:install/3rd-party/taichi-install>` 
+    for installation and setup instructions.
+
+  - You can also consult the upstream `Installation guide <https://github.com/taichi-dev/taichi>`__ 
+    for additional context.
+
+Version support
+--------------------------------------------------------------------------------
+
+Taichi is supported on `ROCm 6.3.2 <https://repo.radeon.com/rocm/apt/6.3.2/>`__.
+
+Supported devices
+--------------------------------------------------------------------------------
+
+- **Officially Supported**: AMD Instinct™ MI250X, MI210 (all Taichi GPU features except Taichi’s GPU rendering system, GGUI)
+- **Upcoming Support**: AMD Instinct™ MI300X

 .. _taichi-recommendations:

 Use cases and recommendations
 ================================================================================
-To fully leverage Taichi's performance capabilities in compute-intensive tasks, it is best to adhere to specific coding patterns and utilize Taichi decorators. 
-A collection of example use cases is available in the `https://github.com/ROCm/taichi_examples <https://github.com/ROCm/taichi_examples>`_ repository, 
-providing practical insights and foundational knowledge for working with the Taichi programming language. 
-You can also refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_ to search for Taichi examples and best practices to optimize your workflows on AMD GPUs.
+
+* The `Accelerating Parallel Programming in Python with Taichi Lang on AMD GPUs 
+  <https://rocm.blogs.amd.com/artificial-intelligence/taichi/README.html>`__
+  blog highlights Taichi as an open-source programming language designed for high-performance 
+  numerical computation, particularly in domains like real-time physical simulation, 
+  artificial intelligence, computer vision, robotics, and visual effects. Taichi 
+  is embedded in Python and uses just-in-time (JIT) compilation frameworks like 
+  LLVM to optimize execution on GPUs and CPUs. The blog emphasizes the versatility 
+  of Taichi in enabling complex simulations and numerical algorithms, making 
+  it ideal for developers working on compute-intensive tasks. Developers are 
+  encouraged to follow recommended coding patterns and utilize Taichi decorators 
+  for performance optimization, with examples available in the `https://github.com/ROCm/taichi_examples 
+  <https://github.com/ROCm/taichi_examples>`_ repository. Prebuilt Docker images 
+  integrating ROCm, PyTorch, and Taichi are provided for simplified installation 
+  and deployment, making it easier to leverage Taichi for advanced computational workloads.

 .. _taichi-docker-compat:

@@ -52,9 +76,8 @@ Docker image compatibility
   <i class="fab fa-docker"></i>

 AMD validates and publishes ready-made `ROCm Taichi Docker images <https://hub.docker.com/r/rocm/taichi/tags>`_
-with ROCm backends on Docker Hub. The following Docker image tags and associated inventories 
+with ROCm backends on Docker Hub. The following Docker image tag and associated inventories 
 represent the latest Taichi version from the official Docker Hub.
-The Docker images have been validated for `ROCm 6.3.2 <https://rocm.docs.amd.com/en/docs-6.3.2/about/release-notes.html>`_. 
 Click |docker-icon| to view the image on Docker Hub.

 .. list-table:: 
--- a/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
    :description: TensorFlow compatibility
-    :keywords: GPU, TensorFlow compatibility
+    :keywords: GPU, TensorFlow, deep learning, framework compatibility

 .. version-set:: rocm_version latest

@@ -12,37 +12,33 @@ TensorFlow compatibility

 `TensorFlow <https://www.tensorflow.org/>`__ is an open-source library for
 solving machine learning, deep learning, and AI problems. It can solve many
-problems across different sectors and industries but primarily focuses on
-neural network training and inference. It is one of the most popular and
-in-demand frameworks and is very active in open-source contribution and
-development.
+problems across different sectors and industries, but primarily focuses on
+neural network training and inference. It is one of the most popular deep 
+learning frameworks and is very active in open-source development.
+
+Support overview
+================================================================================
+
+- The ROCm-supported version of TensorFlow is maintained in the official `https://github.com/ROCm/tensorflow-upstream 
+  <https://github.com/ROCm/tensorflow-upstream>`__ repository, which differs from the 
+  `https://github.com/tensorflow/tensorflow <https://github.com/tensorflow/tensorflow>`__ upstream repository.
+
+- To get started and install TensorFlow on ROCm, use the prebuilt :ref:`Docker images <tensorflow-docker-compat>`, 
+  which include ROCm, TensorFlow, and all required dependencies.
+
+  - See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>` 
+    for installation and setup instructions.
+
+  - You can also consult the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list 
+    for additional context.
+
+Version support
+--------------------------------------------------------------------------------

 The `official TensorFlow repository <http://github.com/tensorflow/tensorflow>`__
 includes full ROCm support. AMD maintains a TensorFlow `ROCm repository
 <http://github.com/rocm/tensorflow-upstream>`__ in order to quickly add bug
-fixes, updates, and support for the latest ROCM versions.
-
- ROCm TensorFlow release:
-
-  - Offers :ref:`Docker images <tensorflow-docker-compat>` with
-    ROCm and TensorFlow pre-installed.
-
-  - ROCm TensorFlow repository: `<https://github.com/ROCm/tensorflow-upstream>`__
-
-  - See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
-    to get started.
-
- Official TensorFlow release:
-
-  - Official TensorFlow repository: `<https://github.com/tensorflow/tensorflow>`__
-
-  - See the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list.
-
-  .. note::
-
-     The official TensorFlow documentation does not cover ROCm support. Use the
-     ROCm documentation for installation instructions for Tensorflow on ROCm.
-     See :doc:`rocm-install-on-linux:install/3rd-party/tensorflow-install`.
+fixes, updates, and support for the latest ROCm versions.

 .. _tensorflow-docker-compat:

--- a/docs/compatibility/ml-compatibility/verl-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/verl-compatibility.rst
@@ -2,7 +2,7 @@

 .. meta::
   :description: verl compatibility
-   :keywords: GPU, verl compatibility
+   :keywords: GPU, verl, deep learning, framework compatibility

 .. version-set:: rocm_version latest

@@ -10,24 +10,58 @@
 verl compatibility
 *******************************************************************************

-Volcano Engine Reinforcement Learning for LLMs (verl) is a reinforcement learning framework designed for large language models (LLMs). 
-verl offers a scalable, open-source fine-tuning solution optimized for AMD Instinct GPUs with full ROCm support.
+Volcano Engine Reinforcement Learning for LLMs (`verl <https://verl.readthedocs.io/en/latest/>`__)  
+is a reinforcement learning framework designed for large language models (LLMs). 
+verl offers a scalable, open-source fine-tuning solution by using a hybrid programming model 
+that makes it easy to define and run complex post-training dataflows efficiently. 

-* See the `verl documentation <https://verl.readthedocs.io/en/latest/>`_ for more information about verl. 
-* The official verl GitHub repository is `https://github.com/volcengine/verl <https://github.com/volcengine/verl>`_.
-* Use the AMD-validated :ref:`Docker images <verl-docker-compat>` with ROCm and verl preinstalled. 
-* See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>` to install and get started.
+Its modular APIs separate computation from data, allowing smooth integration with other frameworks. 
+It also supports flexible model placement across GPUs for efficient scaling on different cluster sizes.
+verl achieves high training and generation throughput by building on existing LLM frameworks. 
+Its 3D-HybridEngine reduces memory use and communication overhead when switching between training 
+and inference, improving overall performance.

-.. note::
+Support overview
+================================================================================

-	verl is supported on ROCm 6.2.0.
+- The ROCm-supported version of verl is maintained in the official `https://github.com/ROCm/verl 
+  <https://github.com/ROCm/verl>`__ repository, which differs from the 
+  `https://github.com/volcengine/verl <https://github.com/volcengine/verl>`__ upstream repository.
+
+- To get started and install verl on ROCm, use the prebuilt :ref:`Docker image <verl-docker-compat>`, 
+  which includes ROCm, verl, and all required dependencies.
+
+  - See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>` 
+    for installation and setup instructions.
+
+  - You can also consult the upstream `verl documentation <https://verl.readthedocs.io/en/latest/>`__ 
+    for additional context.
+
+Version support
+--------------------------------------------------------------------------------
+
+verl is supported on `ROCm 6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`__.
+
+Supported devices
+--------------------------------------------------------------------------------
+
+**Officially Supported**: AMD Instinct™ MI300X

 .. _verl-recommendations:

 Use cases and recommendations
 ================================================================================

-The benefits of verl in large-scale reinforcement learning from human feedback (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`_ blog.
+* The benefits of verl in large-scale reinforcement learning from human feedback 
+  (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD 
+  GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__ 
+  blog. The blog post outlines how the Volcano Engine Reinforcement Learning 
+  (verl) framework integrates with the AMD ROCm platform to optimize training on 
+  Instinct™ MI300X GPUs. The guide details the process of building a Docker image, 
+  setting up single-node and multi-node training environments, and highlights 
+  performance benchmarks demonstrating improved throughput and convergence accuracy. 
+  This resource serves as a comprehensive starting point for deploying verl on AMD GPUs, 
+  facilitating efficient RLHF training workflows.

 .. _verl-supported_features:

@@ -61,8 +95,10 @@ Docker image compatibility

   <i class="fab fa-docker"></i>

-AMD validates and publishes ready-made `ROCm verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
-with ROCm backends on Docker Hub. The following Docker image tags and associated inventories represent the available verl versions from the official Docker Hub. 
+AMD validates and publishes ready-made `verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
+with ROCm backends on Docker Hub. The following Docker image tag and associated inventories 
+represent the latest verl version from the official Docker Hub. 
+Click |docker-icon| to view the image on Docker Hub.

 .. list-table:: 
    :header-rows: 1
--- a/docs/how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
--- a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
@@ -15,10 +15,9 @@ using PyTorch. It delves into specific workloads such as
 :ref:`model inference <mi300x-vllm-optimization>`, offering strategies to
 enhance efficiency.

-The following topics highlight :ref:`auto-tunable configurations <mi300x-auto-tune>`
-that streamline optimization as well as advanced techniques like
-:ref:`Triton kernel optimization <mi300x-triton-kernel-performance-optimization>` for
-meticulous tuning.
+The following topics highlight :ref:`auto-tunable configurations <mi300x-auto-tune>` as
+well as :ref:`Triton kernel optimization <mi300x-triton-kernel-performance-optimization>`
+for meticulous tuning.

 Workload tuning strategy
 ========================
@@ -86,27 +85,28 @@ Optimize model inference with vLLM
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 vLLM provides tools and techniques specifically designed for efficient model
-inference on AMD Instinct MI300X GPUs. See :ref:`fine-tuning-llms-vllm`
-for installation guidance. Optimizing performance with vLLM
-involves configuring tensor parallelism, leveraging advanced features, and
-ensuring efficient execution. Here’s how to optimize vLLM performance:
+inference on AMD Instinct GPUs. See the official `vLLM installation docs
+<https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html>`__ for
+installation guidance. Optimizing performance with vLLM involves configuring
+tensor parallelism, leveraging advanced features, and ensuring efficient
+execution.

-* Tensor parallelism: Configure the
-  :ref:`tensor-parallel-size parameter <mi300x-vllm-multiple-gpus>` to distribute
-  tensor computations across multiple GPUs. Adjust parameters such as
-  ``batch-size``, ``input-len``, and ``output-len`` based on your workload.
-
-* Configuration for vLLM: Set :ref:`parameters <mi300x-vllm-optimization>`
-  according to workload requirements. Benchmark performance to understand
-  characteristics and identify bottlenecks.
+* Configuration for vLLM: Set engine arguments according to workload
+  requirements.

 * Benchmarking and performance metrics: Measure latency and throughput to
  evaluate performance.

+.. seealso::
+
+   See :doc:`vllm-optimization` to learn more about vLLM performance
+   optimization techniques.
+
 .. _mi300x-auto-tune:

 Auto-tunable configurations
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 Auto-tunable configurations can significantly streamline performance
 optimization by automatically adjusting parameters based on workload
 characteristics. For example:
@@ -120,8 +120,7 @@ characteristics. For example:
  your specific hardware.

 * Triton: Use :ref:`Triton’s auto-tuning features <mi300x-autotunable-kernel-config>`
-  to explore various kernel configurations and automatically select the
-  best-performing ones.
+  to explore various kernel configurations and select the best-performing ones.

 Manual tuning
 ^^^^^^^^^^^^^
@@ -328,380 +327,21 @@ hardware counters are also included.

   ROCm Systems Profiler timeline trace example.

-.. _mi300x-vllm-optimization:
-
 vLLM performance optimization
 =============================

-vLLM is a high-throughput and memory efficient inference and serving engine for large language models that has gained traction in the AI community for
-its performance and ease of use. See :ref:`fine-tuning-llms-vllm` for a primer on vLLM with ROCm.
-
-Performance environment variables
---------------------------------
-
-The following performance tips are not *specific* to vLLM -- they are general
-but relevant in this context. You can tune the following vLLM parameters to
-achieve optimal request latency and throughput performance.
-
-* As described in `Environment variables (MI300X)
-  <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#environment-variables>`_,
-  the environment variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM
-  performance. Set it to ``export HIP_FORCE_DEV_KERNARG=1``.
-
-* Set the :ref:`RCCL environment variable <mi300x-rccl>` ``NCCL_MIN_NCHANNELS``
-  to ``112`` to increase the number of channels on MI300X to potentially improve
-  performance.
-
-* Set the environment variable ``TORCH_BLAS_PREFER_HIPBLASLT=1`` to use hipBLASLt to improve performance.
-
-Auto-tuning using PyTorch TunableOp
------------------------------------
-
-Since vLLM is based on the PyTorch framework, PyTorch TunableOp can be used for auto-tuning. 
-You can run auto-tuning with TunableOp in two simple steps without modifying your code:
-
-* Enable TunableOp and tuning. Optionally, enable verbose mode:
-
-  .. code-block:: shell
-
-     PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_VERBOSE=1 your_vllm_script.sh
-
-* Enable TunableOp and disable tuning and measure.
-
-  .. code-block:: shell
-
-     PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 your_vllm_script.sh
-
-Learn more about TunableOp in the :ref:`PyTorch TunableOp <mi300x-tunableop>` section.
-
-Performance tuning based on vLLM engine configurations
-------------------------------------------------------
-
-The following subsections describe vLLM-specific configurations for performance tuning.
-You can tune the following vLLM parameters to achieve optimal performance.
-
-*  ``tensor_parallel_size``
-
-*  ``gpu_memory_utilization``
-
-*  ``dtype``
-
-*  ``enforce_eager``
-
-*  ``kv_cache_dtype``
-
-*  ``input_len``
-
-*  ``output_len``
-
-*  ``max_num_seqs``
-
-*  ``num_scheduler_steps``
-
-*  ``max_model_len``
-
-*  ``enable_chunked_prefill``
-
-*  ``distributed_executor_backend``
-
-*  ``max_seq_len_to_capture``
-
-Refer to `vLLM documentation <https://docs.vllm.ai/en/latest/models/performance.html>`_
-for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
-usage with ROCm.
-
-ROCm provides a prebuilt optimized Docker image for validating the performance
-of LLM inference with vLLM on MI300X Series GPUs. The Docker image includes
-ROCm, vLLM, and PyTorch. For more information, see
-:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
-
-.. _mi300x-vllm-throughput-measurement:
-
-Evaluating performance by throughput measurement
-------------------------------------------------
-
-This tuning guide evaluates the performance of LLM inference workloads by measuring throughput in tokens per second (TPS). Throughput can be assessed using both real-world and synthetic data, depending on your evaluation goals.
-
-Refer to the benchmarking script located at ``benchmarks/benchmark_throughput.py`` in the `vLLM repository <https://github.com/ROCm/vllm/blob/main/benchmarks/benchmark_throughput.py>`_.
-Use this script to measure throughput effectively. You can assess throughput using real-world and synthetic data, depending on your evaluation goals.
-
-* For realistic performance evaluation, you can use datasets like Hugging Face's
-  ``ShareGPT_V3_unfiltered_cleaned_split.json``. This dataset includes real-world conversational
-  data, making it a good representation of typical use cases for language models. Download it using
-  the following command:
-
-  .. code-block:: shell
-
-     wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-* For standardized benchmarking, you can set fixed input and output token
-  lengths. Synthetic prompts provide consistent benchmarking runs, making it
-  easier to compare performance across different models or configurations.
-  Additionally, a controlled environment simplifies analysis.
-
-By balancing real-world data and synthetic data approaches, you can get a well-rounded understanding of model performance in varied scenarios.
-
-.. _mi300x-vllm-single-node:
-
-Maximizing vLLM instances on a single node
------------------------------------------
-
-The general guideline is to maximize per-node throughput by running as many vLLM instances as possible.
-However, running too many instances might lead to insufficient memory for the KV-cache, which can affect performance.
-
-The Instinct MI300X GPU is equipped with 192 GB of HBM3 memory capacity and bandwidth.
-For models that fit in one GPU -- to maximize the accumulated throughput -- you can run as many as eight vLLM instances
-simultaneously on one MI300X node (with eight GPUs). To do so, use the GPU isolation environment
-variable ``CUDA_VISIBLE_DEVICES``.
-
-For example, this script runs eight instances of vLLM for throughput benchmarking at the same time
-with a model that can fit in one GPU:
-
-.. code-block:: shell
-
-   for i in $(seq 0 7);
-   do
-       CUDA_VISIBLE_DEVICES="$i" python3 /app/vllm/benchmarks/benchmark_throughput.py -tp 1 --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
-   done
-
-The total throughput achieved by running ``N`` instances of vLLM is generally much higher than running a
-single vLLM instance across ``N`` GPUs simultaneously (that is, configuring ``tensor_parallel_size`` as N or
-using the ``-tp`` N option, where ``1 < N ≤ 8``).
-
-vLLM on MI300X GPUs can run a variety of model weights, including Llama 2 (7b, 13b, 70b), Llama 3 (8b, 70b), Qwen2 (7b, 72b), Mixtral-8x7b, Mixtral-8x22b, and so on.
-Notable configurations include Llama2-70b and Llama3-70b models on a single MI300X GPU, and the Llama3.1 405b model can fit on one single node with 8 MI300X GPUs.
-
-.. _mi300x-vllm-gpu-memory-utilization:
-
-Configure the gpu_memory_utilization parameter
----------------------------------------------
-
-There are two ways to increase throughput by configuring ``gpu-memory-utilization`` parameter.
-
-1. Increase ``gpu-memory-utilization`` to improve the throughput for a single instance as long as
-   it does not incur HIP or CUDA Out Of Memory. The default ``gpu-memory-utilization`` is 0.9.
-   You can set it to ``>0.9`` and ``<1``.
-
-   For example, below benchmarking command set the ``gpu-memory-utilization`` as 0.98, or 98%.
-
-   .. code-block:: shell
-
-      /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.98 --input-len 1024 --output-len 128 --model /path/to/model
-
-2. Decrease ``gpu-memory-utilization`` to maximize the number of vLLM instances on the same GPU.
-
-   Specify GPU memory utilization to run as many instances of vLLM as possible on a single
-   GPU. However, too many instances can result in no memory for KV-cache. For small models, run
-   multiple instances of vLLM on the same GPU by specifying a smaller ``gpu-memory-utilization`` -- as
-   long as it would not cause HIP Out Of Memory. 
-
-   For example, run two instances of the Llama3-8b model at the same time on a single GPU by specifying
-   ``--gpu-memory-utilization`` to 0.4 (40%) as follows (on GPU ``0``):
-
-   .. code-block:: shell
-
-      CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4 
-      --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
-
-      CUDA_VISIBLE_DEVICES=0 python3 /vllm-workspace/benchmarks/benchmark_throughput.py --gpu-memory-utilization 0.4 
-      --dataset "/path/to/dataset/ShareGPT_V3_unfiltered_cleaned_split.json" --model /path/to/model &
-
-See :ref:`vllm-engine-args` for other performance suggestions.
-
-.. _mi300x-vllm-multiple-gpus:
-
-Run vLLM on multiple GPUs
-------------------------
-
-The two main reasons to use multiple GPUs are:
-
-*  The model size is too big to run vLLM using one GPU, as it results in a HIP Out of Memory error.
-
-*  To achieve better latency when using a single GPU is not desirable.
-
-To run one vLLM instance on multiple GPUs, use the ``-tp`` or ``--tensor-parallel-size`` option to
-specify multiple GPUs. Optionally, use the ``CUDA_VISIBLE_DEVICES`` environment variable to specify
-the GPUs.
-
-For example, you can use two GPUs to start an API server on port 8000:
-
-.. code-block:: shell
-
-   python -m vllm.entrypoints.api_server --model /path/to/model --dtype
-   float16 -tp 2 --port 8000 &
-
-To achieve both latency and throughput performance for serving, you can run multiple API servers on
-different GPUs by specifying different ports for each server and use ``CUDA_VISIBLE_DEVICES`` to
-specify the GPUs for each server, for example:
-
-.. code-block:: shell
-
-   CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.api_server --model
-   /path/to/model --dtype float16 -tp 2 --port 8000 &
-
-   CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.api_server --model
-   /path/to/model --dtype float16 -tp 2 --port 8001 &
-
-Choose an attention backend
---------------------------
-
-vLLM on ROCm supports two attention backends, each suitable for different use cases and performance
-requirements:
-
- **Triton Flash Attention** - For benchmarking, run vLLM scripts at
-  least once as a warm-up step so Triton can perform auto-tuning before
-  collecting benchmarking numbers. This is the default setting.
-
- **Composable Kernel (CK) Flash Attention** - To use CK Flash Attention, specify
-  the environment variable as ``export VLLM_USE_TRITON_FLASH_ATTN=0``.
-
-
-Refer to :ref:`Model acceleration libraries <acceleration-flash-attention>`
-to learn more about Flash Attention with Triton or CK backends.
-
-.. _vllm-engine-args:
-
-vLLM engine arguments
---------------------
-
-The following are configuration suggestions to potentially improve performance with vLLM. See
-`vLLM's engine arguments documentation <https://docs.vllm.ai/en/latest/serving/engine_args.html>`_
-for a full list of configurable engine arguments.
-
-Configure the max-num-seqs parameter
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Increase the ``max-num-seqs`` parameter from the default ``256`` to ``512`` (``--max-num-seqs
-512``). This increases the maximum number of sequences per iteration and can improve throughput.
-
-Use the float16 dtype
-^^^^^^^^^^^^^^^^^^^^^
-
-The default data type (``dtype``) is specified in the model’s configuration file. For instance, some models use ``torch.bfloat16`` as their default ``dtype``.
-Use float16 (``--dtype float16``) for better performance.
-
-Multi-step scheduling
-^^^^^^^^^^^^^^^^^^^^^
-
-Setting ``num-scheduler-steps`` for multi-step scheduling can increase performance. Set it between 10 to 15 (``--num-scheduler-steps 10``).
-
-Distributed executor backend
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The vLLM supports two modes of distributed executor backend: ``ray`` and ``mp``. When using the `<https://github.com/ROCm/vllm>`__ fork, using the ``mp``
-backend (``--distributed_executor_backend mp``) is recommended.
-
-Graph mode max-seq-len-to-capture
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Maximum sequence length covered by CUDA graphs. In the default mode (where ``enforce_eager`` is ``False``), when a sequence has context length
-larger than this, vLLM engine falls back to eager mode. The default is 8192.
-
-When working with models that support long context lengths, set the parameter ``--max-seq-len-to-capture`` to 16384.
-See this `vLLM blog <https://blog.vllm.ai/2024/10/23/vllm-serving-amd.html>`__ for details.
-
-An example of long context length model is Qwen2-7b.
-
-Whether to enable chunked prefill
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Another vLLM performance tip is to enable chunked prefill to improve
-throughput. Chunked prefill allows large prefills to be chunked into
-smaller chunks and batched together with decode requests.
-
-You can enable the feature by specifying ``--enable-chunked-prefill`` in the
-command line or setting ``enable_chunked_prefill=True`` in the LLM
-constructor. 
-
-As stated in `vLLM's documentation <https://docs.vllm.ai/en/latest/models/performance.html#chunked-prefill>`__,
-you can tune the performance by changing ``max_num_batched_tokens``. By
-default, it is set to 512 and optimized for ITL (inter-token latency).
-Smaller ``max_num_batched_tokens`` achieves better ITL because there are
-fewer prefills interrupting decodes.
-Higher ``max_num_batched_tokens`` achieves better TTFT (time to the first
-token) as you can put more prefill to the batch.
-
-You might experience noticeable throughput improvements when
-benchmarking on a single GPU or 8 GPUs using the vLLM throughput
-benchmarking script along with the ShareGPT dataset as input.
-
-In the case of fixed ``input-len``/``output-len``, for some configurations,
-enabling chunked prefill increases the throughput. For some other
-configurations, the throughput may be worse and elicit a need to tune
-parameter ``max_num_batched_tokens`` (for example, increasing ``max_num_batched_tokens`` value to 4096 or larger).
-
-.. note::
-
-   Chunked prefill is no longer recommended. See the vLLM blog: `Serving LLMs on AMD MI300X: Best Practices <https://blog.vllm.ai/2024/10/23/vllm-serving-amd.html>`_ (October 2024).
-
-Quantization support
---------------------
-
-Quantization reduces the precision of the model’s weights and activations, which significantly decreases the memory footprint.
-``fp8(w8a8)`` and ``AWQ`` quantization are supported for ROCm.
-
-FP8 quantization
-^^^^^^^^^^^^^^^^^
-
-`<https://github.com/ROCm/vllm>`__ supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on the Instinct MI300X.
-Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy.
-
-AMD publishes Quark Quantized OCP FP8 models on Hugging Face. For example:
-
-* `Llama-3.1-8B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV>`__
-* `Llama-3.1-70B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`__
-* `Llama-3.1-405B-Instruct-FP8-KV <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`__
-* `Mixtral-8x7B-Instruct-v0.1-FP8-KV <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`__
-* `Mixtral-8x22B-Instruct-v0.1-FP8-KV <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`__
-
-To enable vLLM benchmarking to run on fp8 quantized models, use the ``--quantization`` parameter with value ``fp8`` (``--quantization fp8``).
-
-AWQ quantization
-^^^^^^^^^^^^^^^^
-
-You can quantize your own models by installing AutoAWQ or picking one of the 400+ models on Hugging Face. Be aware that
-AWQ support in vLLM is currently underoptimized.
-
-To enable vLLM to run on ``awq`` quantized models, use the ``--quantization`` parameter with ``awq`` (``--quantization awq``).
-
-You can find more specifics in the `vLLM AutoAWQ documentation <https://docs.vllm.ai/en/stable/quantization/auto_awq.html>`_.
-
-fp8 kv-cache-dtype
-^^^^^^^^^^^^^^^^^^^^^^^
-
-Using ``fp8 kv-cache dtype`` can improve performance as it reduces the size
-of ``kv-cache``. As a result, it reduces the cost required for reading and
-writing the ``kv-cache``.
-
-To use this feature, specify ``--kv-cache-dtype`` as ``fp8``.
-
-To specify the quantization scaling config, use the
-``--quantization-param-path`` parameter. If the parameter is not specified,
-the default scaling factor of ``1`` is used, which can lead to less accurate
-results. To generate ``kv-cache`` scaling JSON file, see `FP8 KV
-Cache <https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_kv_cache/README.md>`__
-in the vLLM GitHub repository.
-
-Two sample Llama scaling configuration files are in vLLM for ``llama2-70b`` and
-``llama2-7b``.
-
-If building the vLLM using
-`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm>`_
-for ``llama2-70b`` scale config, find the file at
-``/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json`` at
-runtime.
-
-Below is a sample command to run benchmarking with this feature enabled
-for the ``llama2-70b`` model:
-
-.. code-block:: shell
-
-   python3 /vllm-workspace/benchmarks/benchmark_throughput.py --model \
-   /path/to/llama2-70b-model --kv-cache-dtype "fp8" \
-   --quantization-param-path \
-   "/vllm-workspace/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json" \
-   --input-len 512 --output-len 256 --num-prompts 500
-
+vLLM is a high-throughput and memory-efficient inference and serving engine for
+large language models that has gained traction in the AI community for its
+performance and ease of use. See :doc:`vllm-optimization`, where you'll learn
+how to:
+
+* Enable AITER (AI Tensor Engine for ROCm) to speed up LLM inference.
+* Configure environment variables for optimal HIP, RCCL, and Quick Reduce performance.
+* Select the right attention backend for your workload (AITER MHA/MLA vs. Triton).
+* Choose parallelism strategies (tensor, pipeline, data, expert) for multi-GPU deployments.
+* Apply quantization (``FP8``/``FP4``) to reduce memory usage by 2-4× with minimal accuracy loss.
+* Tune engine arguments (batch size, memory utilization, graph modes) for your use case.
+* Benchmark and scale across single-node and multi-node configurations.

 .. _mi300x-tunableop:

@@ -946,33 +586,33 @@ for details.

  .. code-block:: shell

-     HIP_FORCE_DEV_KERNARG=1  hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
+     HIP_FORCE_DEV_KERNARG=1  hipblaslt-bench --alpha 1 --beta 0 -r f16_r \
     --a_type f16_r --b_type f8_r --compute_type f32_f16_r \
-     --initialization trig_float  --cold_iters 100 --iters 1000 --rotating 256
+     --initialization trig_float  --cold_iters 100 --iters 1000 --rotating 256

 * Example 2: Benchmark forward epilogues and backward epilogues

-  *  ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";``
+  *  ``HIPBLASLT_EPILOGUE_RELU: "--activation_type relu";``

-  *  ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";``
+  *  ``HIPBLASLT_EPILOGUE_BIAS: "--bias_vector";``

-  *  ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";``
+  *  ``HIPBLASLT_EPILOGUE_RELU_BIAS: "--activation_type relu --bias_vector";``

-  *  ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";``
+  *  ``HIPBLASLT_EPILOGUE_GELU: "--activation_type gelu";``

  *  ``HIPBLASLT_EPILOGUE_DGELU: "--activation_type gelu --gradient";``

-  *  ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";``
+  *  ``HIPBLASLT_EPILOGUE_GELU_BIAS: "--activation_type gelu --bias_vector";``

-  *  ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";``
+  *  ``HIPBLASLT_EPILOGUE_GELU_AUX: "--activation_type gelu --use_e";``

-  *  ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";``
+  *  ``HIPBLASLT_EPILOGUE_GELU_AUX_BIAS: "--activation_type gelu --bias_vector --use_e";``

-  *  ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";``
+  *  ``HIPBLASLT_EPILOGUE_DGELU_BGRAD: "--activation_type gelu --bias_vector --gradient";``

-  *  ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";``
+  *  ``HIPBLASLT_EPILOGUE_BGRADA: "--bias_vector --gradient --bias_source a";``

-  *  ``HIPBLASLT_EPILOGUE_BGRADB:  "--bias_vector --gradient --bias_source b";``
+  *  ``HIPBLASLT_EPILOGUE_BGRADB:  "--bias_vector --gradient --bias_source b";``


 hipBLASLt auto-tuning using hipblaslt-bench
@@ -1031,26 +671,26 @@ The tuning tool is a two-step tool. It first runs the benchmark, then it creates

  .. code-block:: python

-     defaultBenchOptions = {"ProblemType": {
-         "TransposeA": 0,
-         "TransposeB": 0,
-         "ComputeInputDataType": "s",
-         "ComputeDataType": "s",
-         "DataTypeC": "s",
-         "DataTypeD": "s",
-         "UseBias": False
-     }, "TestConfig": {
-         "ColdIter": 20,
-         "Iter": 100,
-         "AlgoMethod": "all",
-         "RequestedSolutions": 2, # Only works in AlgoMethod heuristic
-         "SolutionIndex": None, # Only works in AlgoMethod index
-         "ApiMethod": "cpp",
-         "RotatingBuffer": 0,
-     }, "TuningParameters": {
-         "SplitK": [0]
-     }, "ProblemSizes": []}
-     defaultCreateLogicOptions = {}  # Currently unused
+     defaultBenchOptions = {"ProblemType": {
+         "TransposeA": 0,
+         "TransposeB": 0,
+         "ComputeInputDataType": "s",
+         "ComputeDataType": "s",
+         "DataTypeC": "s",
+         "DataTypeD": "s",
+         "UseBias": False
+     }, "TestConfig": {
+         "ColdIter": 20,
+         "Iter": 100,
+         "AlgoMethod": "all",
+         "RequestedSolutions": 2, # Only works in AlgoMethod heuristic
+         "SolutionIndex": None, # Only works in AlgoMethod index
+         "ApiMethod": "cpp",
+         "RotatingBuffer": 0,
+     }, "TuningParameters": {
+         "SplitK": [0]
+     }, "ProblemSizes": []}
+     defaultCreateLogicOptions = {}  # Currently unused

 * ``TestConfig``
   1. ``ColdIter``: This is the number of warm-up iterations before starting the kernel benchmark.
@@ -1230,7 +870,7 @@ command:

 .. code-block:: shell

-   merge.py original_dir new_tuned_yaml_dir output_dir 
+   merge.py original_dir new_tuned_yaml_dir output_dir 

 The following table describes the logic YAML files.

@@ -1833,7 +1473,7 @@ de-quantize the ``int4`` key-value from the ``int4`` data type to ``fp16``.

 From the IR snippet, you can see ``i32`` data is loaded from global memory to
 registers (``%190``). With a few element-wise operations in registers, it is
-stored in shared memory (``%269``) for the transpose operation (``%270``), which
+stored in shared memory (``%269``) for the transpose operation (``%270``), which
 needs data movement across different threads. With the transpose done, it is
 loaded from LDS to register again (``%276``), and with a few more
 element-wise operations, it is stored to LDS again (``%298``). The last step
@@ -1967,7 +1607,7 @@ something similar to the following:
   loaded at: [0x7fd4f100c000-0x7fd4f100e070]

 The kernel name and the code object file should be listed. In the
-example above, the kernel name is vector_add_assert_trap, but this might
+example above, the kernel name is vector_add_assert_trap, but this might
 also look like:

 .. code-block:: text
@@ -2081,3 +1721,8 @@ Hardware efficiency is maximized with 4 or fewer HIP streams. These environment
 configuration to two compute streams and two RCCL streams, aligning with this best practice.
 Additionally, RCCL is often pre-optimized for MI300 systems in production by querying the node
 topology during startup, reducing the need for extensive manual tuning.
+
+Further reading
+===============
+
+* :doc:`vllm-optimization`
--- a/docs/how-to/rocm-for-ai/system-setup/system-health-check.rst
+++ b/docs/how-to/rocm-for-ai/system-setup/system-health-check.rst
@@ -92,7 +92,7 @@ GPUs, which can impact end-to-end latency.
 .. _healthcheck-install-transferbench:

 1. To get started, use the instructions in the `TransferBench documentation
-   <https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`_
+   <https://rocm.docs.amd.com/projects/TransferBench/en/latest/install/install.html#install-transferbench>`__
   or use the following commands:

   .. code:: shell
@@ -102,5 +102,5 @@ GPUs, which can impact end-to-end latency.
      CC=hipcc make

 2. Run the suggested TransferBench tests -- see `TransferBench benchmarking
-   <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/mi300x/performance-bench.html#transferbench-benchmarking-results>`_
+   <https://instinct.docs.amd.com/projects/system-acceptance/en/latest/common/system-validation.html#transferbench>`__
   in the Instinct performance benchmarking documentation for instructions.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
@@ -14,7 +14,7 @@ Training a model with Megatron-LM on ROCm
   <https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
   deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
   The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
-   including Megatron-LM, `torchtitan, and torchtune <primus-pytorch>`__.
+   including Megatron-LM and :doc:`torchtitan <primus-pytorch>`.

   Primus with Megatron is designed to replace this ROCm Megatron-LM training workflow.
   To learn how to migrate workloads from Megatron-LM to Primus with Megatron,
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
@@ -18,7 +18,7 @@ model training. Performance acceleration is powered by `Primus Turbo
   <https://hub.docker.com/r/rocm/megatron-lm/>`__ Docker Hub registry will be
   deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
   The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
-   including Megatron-LM, `torchtitan, and torchtune <primus-pytorch>`__.
+   including Megatron-LM and :doc:`torchtitan <primus-pytorch>`.

   Primus with Megatron is designed to replace the :doc:`ROCm Megatron-LM
   training <megatron-lm>` workflow. To learn how to migrate workloads from
@@ -183,7 +183,7 @@ Configuration
 =============

 Primus defines a training configuration in YAML for each model in
-`examples/megatron/configs <https://github.com/AMD-AGI/rss/tree/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs>`__.
+`examples/megatron/configs <https://github.com/AMD-AGI/Primus/tree/e16b27bf6c1b2798f38848fc574fee60d9a9b902/examples/megatron/configs>`__.

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-pytorch.rst
@@ -17,7 +17,7 @@ Primus now supports the PyTorch torchtitan backend.
   <https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
   deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
   The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
-   including `Megatron-LM <primus-megatron>`__, torchtitan, and torchtune.
+   including torchtitan and :doc:`Megatron-LM <primus-megatron>`.

   Primus with the PyTorch torchtitan backend is designed to replace the
   :doc:`ROCm PyTorch training <pytorch-training>` workflow. See
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
@@ -14,7 +14,7 @@ Training a model with PyTorch on ROCm
   <https://hub.docker.com/r/rocm/pytorch-training/>`__ Docker Hub registry will be
   deprecated soon in favor of `rocm/primus <https://hub.docker.com/r/rocm/primus>`__.
   The ``rocm/primus`` Docker containers will cover PyTorch training ecosystem frameworks,
-   including `Megatron-LM <primus-megatron>`__, torchtitan, and torchtune.
+   including torchtitan and :doc:`Megatron-LM <primus-megatron>`.

   See :doc:`primus-pytorch` for details.

--- a/docs/how-to/rocm-for-ai/training/scale-model-training.rst
+++ b/docs/how-to/rocm-for-ai/training/scale-model-training.rst
@@ -46,7 +46,7 @@ In DDP training, each process or worker owns a replica of the model and processe

 See the following developer blogs for more in-depth explanations and examples.

-*  `Multi GPU training with DDP — PyTorch Tutorials <https://pytorch.org/tutorials/beginner/ddp_Series_multigpu.html>`_
+*  `Multi GPU training with DDP — PyTorch Tutorials <https://docs.pytorch.org/tutorials/beginner/ddp_series_multigpu.html>`__

 *  `Building a decoder transformer model on AMD GPUs — ROCm Blogs
   <https://rocm.blogs.amd.com/artificial-intelligence/decoder-transformer/README.html#distributed-training-on-multiple-gpus>`_
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -134,6 +134,8 @@ subtrees:
            title: Profile and debug
          - file: how-to/rocm-for-ai/inference-optimization/workload.rst
            title: Workload optimization
+          - file: how-to/rocm-for-ai/inference-optimization/vllm-optimization.rst
+            title: vLLM V1 performance optimization

      - url: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/
        title: AI tutorials
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,4 +1,4 @@
-rocm-docs-core==1.26.0
+rocm-docs-core==1.27.0
 sphinx-reredirects
 sphinx-sitemap
 sphinxcontrib.datatemplates==0.11.0
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -187,8 +187,8 @@ requests==2.32.5
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.26.0
-    # via -r docs/sphinx/requirements.in
+rocm-docs-core==1.27.0
+    # via -r requirements.in
 rpds-py==0.27.1
    # via
    #   jsonschema
@@ -230,13 +230,13 @@ sphinx-last-updated-by-git==0.3.8
 sphinx-notfound-page==1.1.0
    # via rocm-docs-core
 sphinx-reredirects==0.1.6
-    # via -r docs/sphinx/requirements.in
+    # via -r requirements.in
 sphinx-sitemap==2.9.0
-    # via -r docs/sphinx/requirements.in
+    # via -r requirements.in
 sphinxcontrib-applehelp==2.0.0
    # via sphinx
 sphinxcontrib-datatemplates==0.11.0
-    # via -r docs/sphinx/requirements.in
+    # via -r requirements.in
 sphinxcontrib-devhelp==2.0.0
    # via sphinx
 sphinxcontrib-htmlhelp==2.1.0