Compare commits

...

150 Commits

Author SHA1 Message Date
Pratik Basyal
9ecb53b951 Taichi removed (#5802) 2025-12-19 15:44:03 -05:00
Alex Xu
ac0e6d3bc8 bump rocm-docs-core to 1.26.0 2025-10-14 11:36:26 -04:00
Peter Park
1838f82aeb Fix documented VRAM for Radeon AI Pro R9700 (#5203) (#5205)
(cherry picked from commit c154b7e0a3)
2025-08-18 10:20:12 -04:00
anisha-amd
59524e75eb [Docs] 6.4.2: compatibility matrix frameworks support update (#5187) 2025-08-12 14:25:54 -04:00
Alex Xu
b9cd22b770 add pdf format to docs/6.4.2 2025-08-01 15:21:17 -04:00
anisha-amd
596cf19217 Updates to the compatibility matrix with DGL fix (#5143) (#5145) 2025-08-01 11:39:14 -04:00
anisha-amd
1b8c7f25a1 Cherry pick into 6.4.2 - Docs: Adding frameworks compatibility for Megablocks and Taichi (#5137) 2025-07-31 14:01:49 -04:00
Peter Park
869882f496 Update PyT and TF Docker images in compatibility pages for 6.4.2 (#5129)
(cherry picked from commit b61d6a021e)
2025-07-31 10:04:26 -04:00
Pratik Basyal
b3f2646d8d ROCm Software Stack image for 6.4.0 updated (#5112) (#5116) 2025-07-29 09:44:49 -04:00
Peter Park
2ac14a845a Add SGLang inference benchmark doc w/ initial support for DeepSeek-R1-Distill-Qwen-32B (#4870) (#5103)
(cherry picked from commit cc5bc5a882)

Co-authored-by: yugang-amd <yugang.wang@amd.com>
2025-07-25 12:52:47 -04:00
Peter Park
1eaca180d3 Use madengine instead of tools/run_models.py in docs (#5095)
(cherry picked from commit 14249f24d8)
2025-07-24 15:52:16 -04:00
Peter Park
9bf497e4b4 Add DeepSeek Janus Pro 7B to PyTorch inference benchmark doc (#5071)
---------

Co-authored-by: yugang-amd <yugang.wang@amd.com>
(cherry picked from commit 984a91f008)
2025-07-22 16:33:30 -04:00
Peter Park
417b22b81b Merge pull request #5084 from peterjunpark/docs/6.4.2
[docs/6.4.2] Remove broken link to deprecated AMDGPU installer documentation and fix branches for install docs in TOC
2025-07-22 11:16:38 -04:00
Peter Park
93acf292c7 Fix branches for install docs in _toc.yml.in (#5083)
(cherry picked from commit 15ee605d18)
2025-07-22 11:04:40 -04:00
Peter Park
a7388285c7 Remove broken link to deprecated AMDGPU installer documentation (#5078)
* remove link to deprecated AMDGPU installation method

* add deep learning frameworks

(cherry picked from commit 2269e9d25d)
2025-07-22 11:04:40 -04:00
Pratik Basyal
f0ac0a1bef Sphinx warning for ROCm fixed (#5077)
* Sphinx warning for DGL fixed

* Update dgl-compatibility.rst

removed benchmark line and updated link

---------

Co-authored-by: anisha-amd <anisha.sankar@amd.com>
2025-07-22 10:09:54 -04:00
alexxu-amd
f4697b5a47 Sync develop into docs/6.4.2 2025-07-21 17:04:14 -04:00
Pratik Basyal
49548ada2e Date updated for GA (#5073) 2025-07-21 16:52:44 -04:00
Alex Xu
fcf614f195 Merge branch 'roc-6.4.x' into docs/6.4.2 2025-07-21 15:52:59 -04:00
Alex Xu
87c6e320b4 Merge branch 'develop' into roc-6.4.x 2025-07-21 15:52:30 -04:00
alexxu-amd
aa5ddfb483 Merge pull request #5072 from ROCm/sync-develop-from-internal
Sync develop from internal
2025-07-21 15:50:27 -04:00
Pratik Basyal
34bffcb8ac Internal Link reverted to public for 6.4.2 GA (#477)
* Link reverted for GA

* Redundant footnote removed
2025-07-21 15:35:52 -04:00
alexxu-amd
efc302ca83 Merge pull request #476 from ROCm/sync-develop-from-external
Sync develop from external
2025-07-21 15:09:42 -04:00
alexxu-amd
33c2a9fa89 Merge branch 'develop' into sync-develop-from-external 2025-07-21 15:03:20 -04:00
Alex Xu
aa6f40e2e0 Merge remote-tracking branch 'external/develop' into sync-develop-from-external 2025-07-21 14:55:59 -04:00
Pratik Basyal
977c74fe71 Deep learning framework doc highlight update to RN 6.4.2 (#474)
* HIP 7.0 upcoming changes blog link updated

* Documentation highlight for deep learning framework added

* Note loading fixed

* Note removed

* Link fixed
2025-07-21 14:17:10 -04:00
Alex Xu
1d0900b42e Merge branch 'develop' into docs/6.4.2 2025-07-21 13:04:21 -04:00
alexxu-amd
2bbcfc8f92 Update versions.md (#475) 2025-07-21 12:44:58 -04:00
Peter Park
5bcf3b0847 Update Megatron-LM training benchmark doc for v25.6 release (#5064) 2025-07-18 15:57:25 -04:00
Peter Park
7e7e15a201 Fix path to data file in vllm-0.9.1-20250702.rst (#5066) 2025-07-18 14:16:05 -04:00
Pratik Basyal
50718c9dc0 SLES 15 SP 7 note removed from Compatibility matrix (#472)
* HIP 7.0 upcoming changes blog link updated

* SLES 15 SP 7 note removed from compatibility matrix
2025-07-18 10:37:34 -04:00
Peter Park
b437a625b3 Update vLLM inference benchmark doc for 0715 release (#5058) 2025-07-17 15:00:02 -04:00
Daniel Su
09460f7332 [Ex CI] re-enable rocm-examples rocfft_callback (#5062) 2025-07-17 13:59:34 -04:00
spolifroni-amd
6c35f210e3 minor link and comp matrix fixes (#5056) (#5059)
(cherry picked from commit 703e253db5)
2025-07-16 15:37:24 -04:00
spolifroni-amd
0d3b19b3cc added dgl and megatron to csv (#5057) 2025-07-16 15:28:08 -04:00
spolifroni-amd
703e253db5 minor link and comp matrix fixes (#5056) 2025-07-16 14:32:47 -04:00
Daniel Su
ec9b9cad17 [Ex CI] disable checkout in roc/hipSPARSE test jobs (#5046)
* [Ex CI] disable checkout in roc/hipSPARSE test jobs

* rocsparse 2 hour timeout
2025-07-16 10:46:13 -04:00
Daniel Su
20ff132b9b [Ex CI] migrate rocSPARSE, hipSPARSELt pipeline IDs (#5045) 2025-07-16 10:46:07 -04:00
Jan Stephan
16f707d6c4 Merge pull request #5001 from j-stephan/fix-doc-warnings
Fix doc warnings
2025-07-16 07:10:54 -04:00
anisha-amd
a5c7baf1a0 cherry pick frameworks compatibility into 6.4.2
* Merge Verl, DGL, Megatron changes. (#5047)

* Verl compatibility

* verl compatibility

* add Supported features

Signed-off-by: Vicky Tsang <vtsang@amd.com>

* updated and edited verl compat doc

* added links to verl

* add future release for sglang and megatron inference eng.

Signed-off-by: Vicky Tsang <vtsang@amd.com>

* fix lint

Signed-off-by: Vicky Tsang <vtsang@amd.com>

* fixed a typo and a table

* Spolifroni amd/add to compat matrix (#430)

* added verl to compatibility matrix

* small change

* fixed an error in csv

* edited the verl compat based on leo's recommendations

* updated compat matrix (#435)

* Added a hardcoded link to the verl install

This is a link to an RTD build and MUST be removed before publishing.

* Update verl-compatibility.rst

* Added a hardcoded link to the verl install

This link is to an RTD build and it WILL break at publishing. It MUST be changed before publishing.

* Added version support note (#448)

* small fixes

* Update verl-compatibility.rst

* Update verl-compatibility.rst

---------

Signed-off-by: Vicky Tsang <vtsang@amd.com>
Co-authored-by: spolifroni-amd <sandra.polifroni@amd.com>
Co-authored-by: anisha-amd <anisha.sankar@amd.com>
(cherry picked from commit f9bd22626b)

* Stanford Megatron-LM Compatibility

* Create stanford-megatron-lm-compatibility.rst

* toc and wordlist

* Update deep-learning-rocm.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* fixes and adding to main compat matrix

* formatting fix

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

---------

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
(cherry picked from commit f4f096b44e)

* Framework: DGL Compatibility

* Introducing new file for DGL Compatibility

* Update dgl-compatibility.rst

* Update .wordlist.txt

* Update .wordlist.txt

* Update deep-learning-rocm.rst

* compatibility fixes

* Update docs/compatibility/ml-compatibility/dgl-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/dgl-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/dgl-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/dgl-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update dgl-compatibility.rst

* Update dgl-compatibility.rst

* Update dgl-compatibility.rst

* Update dgl-compatibility.rst

* additions to use-cases and system support

* wording and fixes

* Update dgl-compatibility.rst

* Update dgl-compatibility.rst

* remove table heading

* Update compatibility-matrix-historical-6.0.csv

---------

Co-authored-by: anisha-amd <anisha.sankar@amd.com>
Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
(cherry picked from commit 2a7554c0b9)

* Manually resolve merge conflict

* Further merge conflict adjustments

---------

Signed-off-by: Vicky Tsang <vtsang@amd.com>
Co-authored-by: vickytsang <vtsang@amd.com>
Co-authored-by: spolifroni-amd <sandra.polifroni@amd.com>
Co-authored-by: anisha-amd <anisha.sankar@amd.com>
Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
Co-authored-by: Mukhil M S <167260682+mukh1l@users.noreply.github.com>

---------

Signed-off-by: Vicky Tsang <vtsang@amd.com>
Co-authored-by: Jeffrey Novotny <jnovotny@amd.com>
Co-authored-by: vickytsang <vtsang@amd.com>
Co-authored-by: spolifroni-amd <sandra.polifroni@amd.com>
Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
Co-authored-by: Mukhil M S <167260682+mukh1l@users.noreply.github.com>
2025-07-15 19:14:26 -04:00
Jeffrey Novotny
b431415ade Merge Verl, DGL, Megatron changes. (#5047)
* Verl compatibility

* verl compatibility

* add Supported features

Signed-off-by: Vicky Tsang <vtsang@amd.com>

* updated and edited verl compat doc

* added links to verl

* add future release for sglang and megatron inference eng.

Signed-off-by: Vicky Tsang <vtsang@amd.com>

* fix lint

Signed-off-by: Vicky Tsang <vtsang@amd.com>

* fixed a typo and a table

* Spolifroni amd/add to compat matrix (#430)

* added verl to compatibility matrix

* small change

* fixed an error in csv

* edited the verl compat based on leo's recommendations

* updated compat matrix (#435)

* Added a hardcoded link to the verl install

This is a link to an RTD build and MUST be removed before publishing.

* Update verl-compatibility.rst

* Added a hardcoded link to the verl install

This link is to an RTD build and it WILL break at publishing. It MUST be changed before publishing.

* Added version support note (#448)

* small fixes

* Update verl-compatibility.rst

* Update verl-compatibility.rst

---------

Signed-off-by: Vicky Tsang <vtsang@amd.com>
Co-authored-by: spolifroni-amd <sandra.polifroni@amd.com>
Co-authored-by: anisha-amd <anisha.sankar@amd.com>
(cherry picked from commit f9bd22626b)

* Stanford Megatron-LM Compatibility

* Create stanford-megatron-lm-compatibility.rst

* toc and wordlist

* Update deep-learning-rocm.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* fixes and adding to main compat matrix

* formatting fix

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

---------

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
(cherry picked from commit f4f096b44e)

* Framework: DGL Compatibility

* Introducing new file for DGL Compatibility

* Update dgl-compatibility.rst

* Update .wordlist.txt

* Update .wordlist.txt

* Update deep-learning-rocm.rst

* compatibility fixes

* Update docs/compatibility/ml-compatibility/dgl-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/dgl-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/dgl-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/dgl-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update dgl-compatibility.rst

* Update dgl-compatibility.rst

* Update dgl-compatibility.rst

* Update dgl-compatibility.rst

* additions to use-cases and system support

* wording and fixes

* Update dgl-compatibility.rst

* Update dgl-compatibility.rst

* remove table heading

* Update compatibility-matrix-historical-6.0.csv

---------

Co-authored-by: anisha-amd <anisha.sankar@amd.com>
Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
(cherry picked from commit 2a7554c0b9)

* Manually resolve merge conflict

* Further merge conflict adjustments

---------

Signed-off-by: Vicky Tsang <vtsang@amd.com>
Co-authored-by: vickytsang <vtsang@amd.com>
Co-authored-by: spolifroni-amd <sandra.polifroni@amd.com>
Co-authored-by: anisha-amd <anisha.sankar@amd.com>
Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
Co-authored-by: Mukhil M S <167260682+mukh1l@users.noreply.github.com>
2025-07-15 18:57:31 -04:00
vickytsang
f9bd22626b Verl compatibility
* verl compatibility

* add Supported features

Signed-off-by: Vicky Tsang <vtsang@amd.com>

* updated and edited verl compat doc

* added links to verl

* add future release for sglang and megatron inference eng.

Signed-off-by: Vicky Tsang <vtsang@amd.com>

* fix lint

Signed-off-by: Vicky Tsang <vtsang@amd.com>

* fixed a typo and a table

* Spolifroni amd/add to compat matrix (#430)

* added verl to compatibility matrix

* small change

* fixed an error in csv

* edited the verl compat based on leo's recommendations

* updated compat matrix (#435)

* Added a hardcoded link to the verl install

This is a link to an RTD build and MUST be removed before publishing.

* Update verl-compatibility.rst

* Added a hardcoded link to the verl install

This link is to an RTD build and it WILL break at publishing. It MUST be changed before publishing.

* Added version support note (#448)

* small fixes

* Update verl-compatibility.rst

* Update verl-compatibility.rst

---------

Signed-off-by: Vicky Tsang <vtsang@amd.com>
Co-authored-by: spolifroni-amd <sandra.polifroni@amd.com>
Co-authored-by: anisha-amd <anisha.sankar@amd.com>
2025-07-15 16:39:31 -04:00
anisha-amd
f4f096b44e Stanford Megatron-LM Compatibility
* Create stanford-megatron-lm-compatibility.rst

* toc and wordlist

* Update deep-learning-rocm.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* fixes and adding to main compat matrix

* formatting fix

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

* Update stanford-megatron-lm-compatibility.rst

---------

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
2025-07-15 16:23:50 -04:00
Mukhil M S
2a7554c0b9 Framework: DGL Compatibility
* Introducing new file for DGL Compatibility

* Update dgl-compatibility.rst

* Update .wordlist.txt

* Update .wordlist.txt

* Update deep-learning-rocm.rst

* compatibility fixes

* Update docs/compatibility/ml-compatibility/dgl-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/dgl-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/dgl-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update docs/compatibility/ml-compatibility/dgl-compatibility.rst

Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>

* Update dgl-compatibility.rst

* Update dgl-compatibility.rst

* Update dgl-compatibility.rst

* Update dgl-compatibility.rst

* additions to use-cases and system support

* wording and fixes

* Update dgl-compatibility.rst

* Update dgl-compatibility.rst

* remove table heading

* Update compatibility-matrix-historical-6.0.csv

---------

Co-authored-by: anisha-amd <anisha.sankar@amd.com>
Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>
2025-07-15 16:17:58 -04:00
Daniel Su
505698ed3f [Ex CI] enable roc/hipSPARSE monorepo (#5042) 2025-07-15 13:49:57 -04:00
Peter Park
548d31f990 fix broken image in megatron-lm-v24.12-dev.rst (#5043) 2025-07-15 10:57:12 -04:00
Pratik Basyal
b48abafd9b Radeon RX 7700 XT added to 6.4.2 RN and compatibility matrix (#467)
* HIP 7.0 upcoming changes blog link updated

* Radeon RX 7700 XT added to release notes, compatibility matrix

* Changelog synced

* Footnote update
2025-07-14 16:43:50 -04:00
Daniel Su
32f79a966b [Ex CI] fix MIOpen CK script again (#5034) 2025-07-14 14:18:24 -04:00
Daniel Su
393df3e05c [Ex CI] hipSPARSELt monorepo enablement (#5033) 2025-07-11 16:40:18 -04:00
alexxu-amd
702262b801 Sync develop into docs/6.4.2 2025-07-11 16:21:31 -04:00
Alex Xu
cfb740b915 upgrade rocm-docs-core to 1.21.1 2025-07-11 15:56:51 -04:00
randyh62
fcea6ded85 Update RELEASE.md (#462)
Add latest HIP content for 6.4.2
2025-07-11 08:56:20 -07:00
Pratik Basyal
f2f7e36503 6.4.2 Release notes updates post RC4 [Batch1] (#463)
* HIP 7.0 upcoming changes blog link updated

* Changes to AMDGPU and rocPRIM update

* formatting change

* Changelog update
2025-07-11 08:36:37 -04:00
Daniel Su
aa3cdcb3c3 [Ex CI] increase hipSPARSELt test timeout (#5028) 2025-07-10 12:04:06 -04:00
Pratik Basyal
e8bb027c20 HIP 7.0 upcoming changes blog link updated (#5021) 2025-07-10 09:53:44 -04:00
Pratik Basyal
e2e67f57e7 HIP 7.0 upcoming changes blog link updated (#461) 2025-07-10 09:53:34 -04:00
Pratik Basyal
544186aef8 ROCm for HPC table update for Develop (#5015) (#5016) (#5019)
* ROCm for HPC table update for 6.4.0 (#5015) (#5016)

* 6.4.0 updates synced

* Minor change

* Link update
2025-07-09 14:57:53 -04:00
Pratik Basyal
be2bc2142b ROCm for HPC table update for 6.4.2 (#5015) (#5016) (#459)
* ROCm for HPC table update for 6.4.0 (#5015) (#5016)

* 6.4.0 updates synced

* Minor change

* Link update
2025-07-09 14:56:18 -04:00
Peter Park
22524eeaa5 fix xrefs in vllm-0.9.0.1-20250605.rst (#5017) 2025-07-09 14:38:24 -04:00
Peter Park
d471b04cd5 Update vLLM Docker doc for 07/02 2025-07-09 11:38:27 -04:00
Di Nguyen
1c7cff8a47 Merge pull request #5011 from ROCm/zenguyen/disable-device-merge-inplace-rocprim
[rocPRIM] Disable device_merge_inplace unit test for rocPRIM
2025-07-09 09:12:08 -06:00
Daniel Su
84c664074f [Ex CI] add OS to copyHIP filenames (#5012) 2025-07-09 10:37:23 -04:00
Pratik Basyal
08b39b61b1 Post RC3 6.4.2 Release notes updates [Batch 1] (#447)
* RC3 manifest.xml added

* Warpsize reference added

* SLES15 SP7 support added

* Consolidated changelog synced

* Footnote order updated

* Footnote order

* Footnote reordered

* Leo's feedback incorporated

* SLES 15 SP7 support added to runfile

* Wording update for SLES 15 support in Runfile installer

* AMD SMI CPER known issues added

* Known issues removed post RC4

* Resolved issues added

* RC4 manifest added

* Compute Profiler highlight updated

* Changelog.md synced
2025-07-09 08:33:44 -04:00
NguyenNhuDi
7c6083d840 disabled device_merge_inplace 2025-07-08 14:08:53 -06:00
Daniel Su
94099b1398 [Ex CI] rocPyDecode: fix test running (#5002) 2025-07-08 14:32:30 -04:00
Peter Park
3b3fc4894b Fix xrefs and Sphinx warnings in documentation
2025-07-08 13:22:53 -04:00
Daniel Su
8aba1d2318 [Ex CI] fix printed artifact download links (#4998) 2025-07-04 14:41:33 -04:00
Mirza Halilčević
e9e75cfc46 Merge pull request #4963 from ROCm/pybind11
Add pybind11 as a pip module requirement for azure
2025-07-04 13:35:24 +02:00
Peter Park
58b3ad0509 Fix Docker run commands in Megatron-LM Docker doc (#4996)
* fix megatron-lm docker run commands

* update --shm-size option
2025-07-02 14:19:27 -04:00
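This fix targets docker run commands in the Megatron-LM benchmark documentation. As a purely hypothetical illustration of the one option the message names (the image tag, the size value, and the pipeline-step wrapper are placeholders, not from the source):

# Hypothetical sketch: only the --shm-size option is named in the commit
# message; the image tag and the 64g value are invented placeholders.
steps:
  - script: docker run -it --shm-size=64g rocm/megatron-lm:latest
    displayName: Launch container (illustrative)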
Daniel Su
523d8520f3 [Ex CI] rocBLAS: increase test timeout to 2 hours (#4995) 2025-07-02 12:16:50 -04:00
Peter Park
d0c8ba0805 Add Wan2.1 to PyTorch inference Docker documentation (#4984)
* add wan2.1 to pyt inference models

* update group name

* fix container tag

* fix group name

* change documented data type to bfloat16

* fix col width
2025-07-02 09:58:37 -04:00
ammallya
73de8a3e46 Removing failing checkout step 2025-07-01 11:25:17 -07:00
Daniel Su
1fc312f90f [Ex CI] fix hardcoded gfx in MIOpen CK script (#4993) 2025-06-30 15:34:54 -04:00
Daniel Su
fde2647ccd [Ex CI] migrate rocBLAS to monorepo (#4987) 2025-06-30 15:16:58 -04:00
Daniel Su
798c8debb5 [Ex CI] consolidate artifact extraction and deletion in deps-rocm (#4961) 2025-06-30 14:12:52 -04:00
dependabot[bot]
393ba600c2 Build(deps): Bump sphinx-sitemap from 2.6.0 to 2.7.2 in /docs/sphinx (#4985)
Bumps [sphinx-sitemap](https://github.com/jdillard/sphinx-sitemap) from 2.6.0 to 2.7.2.
- [Release notes](https://github.com/jdillard/sphinx-sitemap/releases)
- [Changelog](https://github.com/jdillard/sphinx-sitemap/blob/master/CHANGELOG.rst)
- [Commits](https://github.com/jdillard/sphinx-sitemap/compare/v2.6.0...v2.7.2)

---
updated-dependencies:
- dependency-name: sphinx-sitemap
  dependency-version: 2.7.2
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-06-30 09:33:28 -06:00
Daniel Su
c64c545b52 [Ex CI] hipBLASLt: build some archs on medium pool (#4986) 2025-06-30 11:32:35 -04:00
Daniel Su
76ee1d720f [Ex CI] rocAL: switch to medium pool (#4983) 2025-06-27 13:41:07 -04:00
Daniel Su
5adc040367 [Ex CI] migrate hipBLAS-common & hipBLASLt pipeline IDs (#4982) 2025-06-27 12:09:58 -04:00
Daniel Su
061da8f306 [Ex CI] enable almalinux8 and gfx1100 builds for hipBLASLt, rocBLAS, rocSOLVER (#4955) 2025-06-27 10:39:30 -04:00
Daniel Su
e26767bca6 [Ex CI] Tensile: add boost filesystem (#4980) 2025-06-27 10:38:31 -04:00
Daniel Su
7b6f1800d4 [Ex CI] fix miopen-get-ck for new artifact naming scheme (#4979) 2025-06-26 15:49:13 -04:00
Pratik Basyal
a6221937f2 KMD UMD support footnote update ROCm 640 (#4973) (#4976)
* KMD UMD support footnote update ROCm 640

* Historical footnote
2025-06-26 15:34:21 -04:00
Daniel Su
ac2df2961d [Ex CI] add component name to artifact download filter (#4974) 2025-06-26 13:55:03 -04:00
alexxu-amd
dc0d89ff4f Merge pull request #451 from ROCm/sync-develop-from-external
Sync develop from external
2025-06-24 16:41:37 -04:00
Alex Xu
9812d8f745 Merge remote-tracking branch 'external/develop' into sync-develop-from-external 2025-06-24 15:45:03 -04:00
Mirza Halilcevic
9b102061f4 Add pybind11 as a pip module requirement for azure. 2025-06-24 08:06:52 -05:00
Daniel Su
f20e8dec8b [Ex CI] revert PRIM default branch to develop (#4960) 2025-06-23 16:35:02 -04:00
Daniel Su
10e9157f39 [Ex CI] allow rerun jobs to upload artifacts (#4959) 2025-06-23 15:37:52 -04:00
Daniel Su
a2ce6021cb [Ex CI] add more OSs to nightly build (#4958) 2025-06-23 15:13:11 -04:00
Peter Park
2196fc9a2f Fix pytorch training 25.6 doc (#4956)
* fix pytorch-training history

* fix pytorch-training

fix
2025-06-23 13:45:50 -04:00
Pratik Basyal
4342239006 Quick fix (#446) 2025-06-23 11:48:05 -04:00
Daniel Su
925689f89e [Ex CI] enable gfx1100 builds (#4954) 2025-06-23 11:26:35 -04:00
Pratik Basyal
151b9bb6d4 Review feedback to release notes post RC3 (#444)
* Review feedback to release notes

* AI tutorials addition listed in RN

* Resolved issue added

* Indentation fix

* OS support changes

* Minor fix

* SLES mention removed
2025-06-23 11:25:40 -04:00
randyh62
76549f97b9 Update RELEASE.md (#445)
Merge changes from updated Changelog
2025-06-23 11:02:50 -04:00
Peter Park
91a541f8b9 Update PyTorch training benchmark doc for v25.6 (#4950)
* update pytorch-training docker details

* add previous version

* add models data

* update models data id

* add models picker

* update data

* update fmt

fmt

* update data yaml

* update template

* update data

* fix

* fix vllm-0.6.4 broken link

* fix vllm history
2025-06-23 09:26:15 -04:00
Peter Park
34f8d57ece Organize version histories in ROCm for AI benchmark Docker docs (#4948)
* add vllm 0.8.3 20250415

update prev versions table

* add vllm previous versions page

* move index to vllm-history

* add standalone megatron-lm version history

* add pytorch training version history

* fix

* add vllm-0.4.3

* add vllm-0.6.4

* update vllm-history

* add vllm-0.7.3

* add vllm-0.6.6

* add notes

* fix vllm readme links

fix main page link

* add latest version to previous versions list

* add jax-maxtext history

* fix jax-maxtext history

* add pytorch-training history

* add link in jax-maxtext 25.4

* add megatron-lm history

* fix datatemplate path for vllm 0.8.3

* fix jax-maxtext history link

* update note about performance measurements

* add vllm 0.8.5_20250521 previous version

* consistency fixes
2025-06-20 15:01:38 -04:00
yugang-amd
55f95adc7c Update for vllm -06/10 (#4943) 2025-06-20 08:41:37 -04:00
Pratik Basyal
535ca32590 Updates to 6.4.2 release notes post RC2 review (#439)
* Updates to release notes

* AMDGPU reference broken link fix

* Jeff's feedback incorporated

* Add AMD SMI changelog for 6.4.2

* add amd smi changelog

* Apply suggestions from code review

Co-authored-by: Pratik Basyal <prbasyal@amd.com>

---------

Co-authored-by: Pratik Basyal <prbasyal@amd.com>

* AMDGPU installer highlight

* AMDGPU installation removal highlight

* AMD SMI changelog synced with consolidated changelog

* AMDSMI v25.5.1 updated in compatibility table

* fix AMD SMI changelog formatting

* Quick update

* Leo's feedback incorporated

* Update on ROCm Offline Installer

* AMDGPU installer link updated

* rocBLAS changes added

* Consolidated changelog synced

* Version update post RC3

* AMD SMI upcoming changes added

* Minor change

---------

Co-authored-by: Peter Park <peter.park@amd.com>
2025-06-19 13:59:01 -04:00
Daniel Su
e05b1702d8 [Ex CI] fix experimental HIP to CLR triggers (#4946) 2025-06-19 12:56:53 -04:00
Daniel Su
4179042cf7 [Ex CI] add multi-OS support to copyHIP (#4945) 2025-06-19 12:15:22 -04:00
dependabot[bot]
ae2de81b79 Build(deps): Bump urllib3 from 2.4.0 to 2.5.0 in /docs/sphinx (#4942)
Bumps [urllib3](https://github.com/urllib3/urllib3) from 2.4.0 to 2.5.0.
- [Release notes](https://github.com/urllib3/urllib3/releases)
- [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst)
- [Commits](https://github.com/urllib3/urllib3/compare/2.4.0...2.5.0)

---
updated-dependencies:
- dependency-name: urllib3
  dependency-version: 2.5.0
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-06-19 09:03:29 -06:00
yugang-amd
aa29e156a8 Remove xref to amdgpu-install (#442) 2025-06-18 14:49:35 -04:00
Daniel Su
efd6cec4a4 [Ex CI] disable downstream triggers for mathlibs not yet migrated (#4936) 2025-06-18 14:10:58 -04:00
Daniel Su
b65996587f [Ex CI] remove ALLOWED_PARTIAL_SUCCEED_BUILDS library variable (#4937) 2025-06-18 12:10:04 -04:00
yugang-amd
7b7eaf69f2 remove broken xref (#4939) 2025-06-18 10:15:53 -04:00
Istvan Kiss
666996ee2d Merge pull request #434 from ROCm/pytorch_6.4.1_comp
Docs: Pytorch compatibility page update
2025-06-18 13:21:04 +02:00
Adel Johar
51cb6461b5 Docs: Pytorch compatibility page update 2025-06-18 11:12:47 +02:00
Daniel Su
4cfc8ddad2 [Ex CI] MIVisionX: add hipBLASLt to build deps (#4931) 2025-06-17 13:40:35 -04:00
Daniel Su
97ebbb227d [Ex CI] rocprof-sdk: add cmake, libsqlite3-dev (#4935) 2025-06-17 13:40:15 -04:00
Daniel Su
8c6a1726fe [Ex CI] remove old aqlprofile param in Pytorch (#4927) 2025-06-16 15:17:23 -04:00
Daniel Su
2656143c9e [Ex CI] fix ROCm versions (#4930) 2025-06-16 11:42:51 -04:00
Daniel Su
7910841c94 [Ex CI] rccl: use vendored gtest, use GPU_TARGETS flag (#4929) 2025-06-16 11:35:20 -04:00
Daniel Su
30fec8f74a [Ex CI] update ROCm versioning (#4928) 2025-06-16 11:31:19 -04:00
Daniel Su
1923f801e0 [Ex CI] fix hipRAND multi-OS tests, Tensile sparse dir (#4923) 2025-06-13 16:21:13 -04:00
Peter Park
d69037bfcc Fix Sphinx issue in vllm-benchmark 0.8.5-20250513 previous version (#4924)
* fix sphinx issue in vllm-benchmark 0.8.5-20250513 previous version

* update article_info in conf.py

* update rocm/vllm
2025-06-13 15:03:51 -04:00
Daniel Su
7ac6aa4084 [Ex CI] add OS support to monorepo downstream triggers (#4920) 2025-06-13 12:26:05 -04:00
Daniel Su
14f3c42320 [Ex CI] Tensile almalinux8 builds (#4915) 2025-06-12 16:43:55 -04:00
Pratik Basyal
86efb8c0c7 6.4.2 post RC2 release notes review feedback incorporated (#437)
* Compatibility table update

* Quick updates

* HIP 7.0 change link reintroduced
2025-06-12 15:57:09 -04:00
Daniel Su
67be6f6249 [Ex CI] migrate roc/hipRAND pipelines, change migrated mathlibs' default branch to rocm-rel-7.0 (#4918)
* [Ex CI] migrate roc/hipRAND pipeline IDs to monorepo

* [Ex CI] change migrated mathlibs' default branch to rocm-rel-7.0
2025-06-12 15:39:41 -04:00
Pratik Basyal
5d594feeac Initial changes to 6.4.2 release notes, compatibility matrix, and changelog. (#427)
* Initial 6.4.2 changes to release notes

* 6.4.2 initial changes applied

* edited entry for rocPRIM

* Add HIP content for 6.4.2

* Release notes for 6.4.2 updated

* Conf.py updated

* Review and fixed issue updated

* RCCL changelog update

* Compatibility matrix updated

* Pointed to internal linux docs

* Historical changelog added

* Quick change on component table

* Changelog for Systems profiler updated

* Consolidated changelog synced

* ROCm validation suite changelog added

* Leo's feedback incorporated

* Highlights added

* Manifest added for comparison

* Manifest 642 RC1 added

* RC2 manifest added

* KMD/UMD footnote updated

* Minor changes

* Consolidated changelog synced

---------

Co-authored-by: spolifroni-amd <sandra.polifroni@amd.com>
Co-authored-by: randyh62 <42045079+randyh62@users.noreply.github.com>
2025-06-12 13:40:02 -04:00
powderluv
2502fc5bcf Update README.md to point to TheRock (#4907)
* Update README.md to point to TheRock

Point to TheRock build system to build ROCm

* Update README.md

---------

Co-authored-by: David Galiffi <dgaliffi@amd.com>
Co-authored-by: alexxu-amd <159800977+alexxu-amd@users.noreply.github.com>
2025-06-12 10:44:34 -04:00
Adel Johar
4a05e26a0e Merge pull request #433 from ROCm/jax_6.4.1_comp
Docs: Overhaul JAX compatibility page for 6.4.1
2025-06-12 16:23:41 +02:00
Adel Johar
c699aaf915 Docs: Overhaul JAX compatibility page 2025-06-12 14:35:30 +02:00
Pratik Basyal
61c6749a10 Link to 6.4.1 updated from internal to public (#4913) 2025-06-10 16:59:52 -04:00
Daniel Su
8e8104c811 [Ex CI] add new rocprof-compute pip packages (#4905) 2025-06-10 16:06:51 -04:00
Peter Park
cfb3504d77 Add Mochi Video to pytorch-inference-benchmark-models.yaml
2025-06-10 13:18:41 -04:00
Joseph Macaranas
3602bc5142 [External CI] Revert hip/clr workaround (#4908)
- Undo workaround for hip/clr.
- Build latest hip/clr and deal with fallout.
- Issues need to be caught and escalated.
2025-06-10 11:13:14 -04:00
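The workaround reverted here is visible in the first diff hunk further down: the pinned cloneRepos parameter (HIP, clr, and hipother at fixed commits) is dropped in favor of the shared checkout template. A minimal sketch of the restored pattern, assuming only the template path, parameter names, and comments that appear in that hunk:

# Sketch only: template path, parameter names, and comments come from the
# diff below; the surrounding job structure is assumed.
steps:
  # checkout triggering repo (either HIP or clr)
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
  # if triggered by HIP, the matching repo is clr; if triggered by clr, it is HIP
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: matching_repo
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: hipother_repo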
Scott Todd
cf4a8ecf28 Advertise TheRock near the top of README.md. (#4906) 2025-06-09 20:20:55 -07:00
Daniel Su
a5aae151b7 [Ex CI] shard rocPRIM tests across 3 runners (#4895) 2025-06-09 15:52:35 -04:00
Alex Xu
685457834a upgrade rocm-docs-core to 1.20.1 2025-06-09 14:53:09 -04:00
Joseph Macaranas
1c715356b6 [External CI] More AlmaLinux 8 Pipelines (#4898)
Added AlmaLinux 8 Pipeline Support
- aomp
- HIPIFY
- rocDecode
- ROCgdb
- rocJPEG
- rocprofiler
- aqlprofile dependency template
- build autotools template
- download latest cmake template

Pipeline Changes
- More gfx build targets.
- Copying llvm-lit to the llvm-project published artifacts.
- HIPIFY now uses our built version of llvm-project for its pipeline.
- Disable testing in HIPIFY pipeline due to low value provided. Revisit in the future.
- aomp's ROCm dependency list reduced.
- aomp's openmp build had issues with ninja on AlmaLinux 8.
2025-06-09 14:16:59 -04:00
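These AlmaLinux 8 enablement commits rely on the job-matrix pattern that recurs in the hunks below: each pipeline declares a jobMatrix parameter pairing an OS with its package manager, then expands it with a compile-time each loop. A minimal sketch, assuming the os/packageManager values shown in the HIPIFY hunk; the job body is a placeholder:

# Sketch only: the jobMatrix shape and os/packageManager values mirror the
# diff below; job naming and the script step are assumed.
parameters:
  - name: jobMatrix
    type: object
    default:
      buildJobs:
        - { os: ubuntu2204, packageManager: apt }
        - { os: almalinux8, packageManager: dnf }
jobs:
  - ${{ each job in parameters.jobMatrix.buildJobs }}:
    - job: build_${{ job.os }}
      steps:
        - script: echo "build on ${{ job.os }} using ${{ job.packageManager }}"
          displayName: Build (${{ job.os }})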
Joseph Macaranas
ca8df59ba8 [External CI] Fix pytorch nightly build errors (#4901)
- Add hipSPARSELt dependency.
- Add hipBLASLt test dependency for rocroller shared library.
- Update pip dependency versions.
- Install another typing_extensions at a specific folder for one of the builds we do not control to work.
- Wheel renaming no longer works, so we need to find another mechanism if we start doing builds for different branches and gfx architectures.
2025-06-09 13:15:41 -04:00
Joseph Macaranas
ad1ac5a4e8 [External CI] AlmaLinux 8 builds for more libraries (#4897)
- Fixed rocprim pipeline to not rebuild during install step.
- Updates to hipblas-common, hipcub, hiprand, and rocthrust pipelines to build on AlmaLinux8 and more gfx architectures.
- Include rocm-cmake dependency when CMake setup mentions it.
2025-06-09 10:35:52 -04:00
Daniel Su
e2d0f4a362 [Ex CI] delete unused global variables (#4896) 2025-06-09 10:30:17 -04:00
Daniel Su
f0bef19f15 Ex CI: roc/hipFFT downstream builds (#4840) 2025-06-06 15:20:46 -04:00
Ian Dass
204032493b [External CI] Ubuntu 24.04 job for llvm-project (#4893)
* [External CI] Ubuntu 24.04 job for llvm-project

* temporarily switch to using 'high' build pool while 'ultra' is down

* switch almalinux8 to build on manylinux container

* add pool for alma8 container

* switch alma8 package manager to apt

* Update llvm-project.yml

* switch back to dnf after resolved container init

---------

Co-authored-by: Joseph Macaranas <Joseph.Macaranas@amd.com>
2025-06-06 14:46:30 -04:00
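The per-OS pool and container selection this commit introduces appears again in the HIPIFY hunk below; a minimal sketch of the conditional-insertion pattern, with the pool, image, and endpoint names copied from that hunk and everything else assumed:

# Sketch only: pool/image/endpoint values are taken from the diff below.
pool:
  ${{ if eq(job.os, 'ubuntu2404') }}:
    name: rocm-ci_medium_build_pool_2404
  ${{ else }}:
    name: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
  container:
    image: rocmexternalcicd.azurecr.io/manylinux228:latest
    endpoint: ContainerService3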
Joseph Macaranas
934dd0892c [External CI] Unblock rocm-libraries progress by not building gfx11 rocprim (#4894) 2025-06-06 14:41:50 -04:00
ammallya
b50948fe6b Fix for rocrsamples and rocr_debug_agent (#4863)
* Fix for rocrsamples

* Fix for rocr_debug_agent
2025-05-30 16:27:29 -07:00
ammallya
91407405a9 Changed naming convention for hip (#4837)
* Changed naming convention for hip

* Changed naming convention for hip
2025-05-29 10:19:28 -07:00
ammallya
8f23f63a6b Fix for tests (#4818)
* Fix for RBT

* Fix for roctst and kfd test
2025-05-27 17:38:48 -07:00
Alex Xu
11747aaadc Merge branch 'develop' into roc-6.4.x 2025-05-21 15:04:02 -04:00
Alex Xu
1088beefe5 Merge branch 'develop' into roc-6.4.x 2025-05-21 12:27:13 -04:00
Alex Xu
b7988925a5 Merge branch 'develop' into roc-6.4.x 2025-05-21 12:25:30 -04:00
chiranjeevipattigidi
89dafa6232 Update packages - remove broken packages (#4758)
* Update envsetup.sh HIP_ON_ROCclr_ROOT path to hip and remove aqlprofiletest

* Update packages - remove broken packages
2025-05-21 09:06:39 -07:00
chiranjeevipattigidi
8054852dad Update envsetup.sh HIP_ON_ROCclr_ROOT path to hip and remove (#4755)
aqlprofiletest
2025-05-20 07:59:07 -07:00
ammallya
542d7813ce Removing aqlprofiletest 2025-04-14 15:26:24 -07:00
ammallya
bc1ffe4fcb bypass tests 2025-04-14 13:41:34 -07:00
ammallya
09997c68bb Removing kfd test 2025-04-14 12:55:13 -07:00
ammallya
42bc3501ac Merge pull request #4623 from ammallya/roc-6.4.x
Rebasing branch 6.4.x
2025-04-14 11:42:06 -07:00
133 changed files with 13340 additions and 3886 deletions

View File

@@ -35,22 +35,6 @@ parameters:
type: object
default:
- llvm-project
- name: repos
type: object
default:
cloneRepos:
- hip:
name: 'HIP'
url: 'https://github.com/ROCm/HIP.git'
commit: 'efee830'
- clr:
name: 'clr'
url: 'https://github.com/ROCm/clr.git'
commit: '78f3cab'
- hipother:
name: 'hipother'
url: 'https://github.com/ROCm/hipother.git'
commit: 'e0ff75f'
# hip and clr are tightly-coupled
# run this same template for both repos
@@ -67,7 +51,7 @@ parameters:
# HIP with AMD backend
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hip_clr_combined_amd_${{ job.os }}
- job: hip_clr_combined_${{ job.os }}_amd
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
@@ -85,17 +69,18 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- checkout: none
- ${{ each repo in parameters.repos.cloneRepos }}:
- task: Bash@3
displayName: 'Clone ${{ repo.name }} @ ${{ repo.commit }}'
inputs:
targetType: inline
script: |
echo "Cloning ${{ repo.name }}..."
git clone ${{ repo.url }} ${{ repo.name }}
cd ${{ repo.name }}
git checkout ${{ repo.commit }}
# checkout triggering repo (either HIP or clr)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
# if this is triggered by HIP repo, matching repo is clr
# if this is triggered by clr repo, matching repo is HIP
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: matching_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: hipother_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
@@ -136,7 +121,7 @@ jobs:
# HIP with Nvidia backend
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hip_clr_combined_nvidia_${{ job.os }}
- job: hip_clr_combined_${{ job.os }}_nvidia
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
@@ -155,17 +140,18 @@ jobs:
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- checkout: none
- ${{ each repo in parameters.repos.cloneRepos }}:
- task: Bash@3
displayName: 'Clone ${{ repo.name }} @ ${{ repo.commit }}'
inputs:
targetType: inline
script: |
echo "Cloning ${{ repo.name }}..."
git clone ${{ repo.url }} ${{ repo.name }}
cd ${{ repo.name }}
git checkout ${{ repo.commit }}
# checkout triggering repo (either HIP or clr)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
# if this is triggered by HIP repo, matching repo is clr
# if this is triggered by clr repo, matching repo is HIP
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: matching_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: hipother_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}

View File

@@ -1,4 +1,7 @@
parameters:
- name: componentName
type: string
default: HIPIFY
- name: checkoutRepo
type: string
default: 'self'
@@ -13,113 +16,140 @@ parameters:
- name: aptPackages
type: object
default:
- cmake
- ninja-build
- cuda-toolkit-12-9
- libcudnn9-dev-cuda-12
- libnuma-dev
- mesa-common-dev
- ninja-build
- python-is-python3
- python3-dev
- python3-pip
- python-is-python3
- mesa-common-dev
- ccache
- cuda-toolkit
- cudnn
- name: pipModules
type: object
default:
- lit
- name: rocmDependencies
type: object
default:
- llvm-project
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: HIPIFY
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: UPSTREAM_LLVM_GIT_URL
value: https://github.com/llvm/llvm-project.git
- name: UPSTREAM_LLVM_TAG
value: llvmorg-18.1.2
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:
- task: Bash@3
displayName: 'Register CUDA packages'
inputs:
targetType: inline
script: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo rm -f cuda-keyring_1.1-1_all.deb
sudo apt update
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: Bash@3
displayName: git clone upstream llvm-project
inputs:
targetType: inline
script: git clone $(UPSTREAM_LLVM_GIT_URL) --depth=1 --branch $(UPSTREAM_LLVM_TAG) --recurse-submodules
workingDirectory: $(Pipeline.Workspace)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- script: |
mkdir -p $(CCACHE_DIR)
echo "##vso[task.prependpath]/usr/lib/ccache:/usr/local/cuda/bin"
displayName: Update path for cuda and ccache
- task: Cache@2
displayName: Ccache caching
inputs:
key: HIPIFY | $(Agent.OS) | "$(UPSTREAM_LLVM_TAG)"
path: $(CCACHE_DIR)
restoreKeys: HIPIFY | $(Agent.OS)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: upstream-llvm
cmakeBuildDir: $(Pipeline.Workspace)/llvm-project/llvm/build
cmakeSourceDir: $(Pipeline.Workspace)/llvm-project/llvm
installDir: $(Pipeline.Workspace)/llvm
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DLLVM_ENABLE_PROJECTS=clang
-DLLVM_INCLUDE_TESTS=OFF
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-DCMAKE_C_COMPILER_LAUNCHER=ccache
-GNinja
- task: Bash@3
displayName: python install lit
inputs:
targetType: inline
script: sudo python3 $(Pipeline.Workspace)/llvm-project/llvm/utils/lit/setup.py install
- task: Bash@3
displayName: install FileCheck
inputs:
targetType: inline
script: cp $(Pipeline.Workspace)/llvm-project/llvm/build/bin/FileCheck $(Pipeline.Workspace)/llvm/bin
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: HIPIFY
extraBuildFlags: >-
-DHIPIFY_CLANG_TESTS=ON
-DCMAKE_BUILD_TYPE=Release
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/targets/x86_64-linux
-DCUDA_DNN_ROOT_DIR=/usr/local/cuda/targets/x86_64-linux
-DCMAKE_PREFIX_PATH=$(Pipeline.Workspace)/llvm;/usr/local/cuda/targets/x86_64-linux/lib
-DLLVM_EXTERNAL_LIT=$(Pipeline.Workspace)/llvm-project/llvm/build/bin/llvm-lit
multithreadFlag: -- -j32
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: HIPIFY
testDir: $(Build.SourcesDirectory)/build
testExecutable: make
testParameters: test-hipify
testPublishResults: false
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: combined
registerCUDAPackages: true
extraCopyDirectories:
- llvm-project
extraEnvVars:
- UPSTREAM_LLVM_GIT_URL:::https://github.com/llvm/llvm-project.git
- UPSTREAM_LLVM_TAG:::llvmorg-18.1.2
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
name: rocm-ci_medium_build_pool_2404
${{ else }}:
name: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- task: Bash@3
displayName: 'Register CUDA packages'
inputs:
targetType: inline
${{ if eq(job.os, 'ubuntu2204') }}:
script: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo rm -f cuda-keyring_1.1-1_all.deb
sudo apt update
${{ if eq(job.os, 'almalinux8') }}:
script: |
sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- task: Bash@3
displayName: Add lit to PATH
inputs:
targetType: inline
script: |
site_packages=$(python3 -m site --user-base)/bin
sudo ln -sf $site_packages/bin/lit $(Pipeline.Workspace)/llvm-lit
echo "##vso[task.prependpath]$site_packages"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
# cutensor is not available from apt or dnf
- task: Bash@3
displayName: 'Download and install cutensor'
inputs:
targetType: inline
script: |
wget -q --show-progress https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor/linux-x86_64/libcutensor-linux-x86_64-2.2.0.0-archive.tar.xz
tar -xvJf libcutensor-linux-x86_64-*.tar.xz
mkdir -p $(Pipeline.Workspace)/cutensor
cp -r libcutensor-linux-x86_64-*/* $(Pipeline.Workspace)/cutensor/
- task: Bash@3
displayName: 'List downloaded CUDA files'
inputs:
targetType: inline
script: ls -la1R /usr/local/cuda-12.9
# script: cp $(Pipeline.Workspace)/llvm-project/llvm/build/bin/FileCheck $(Pipeline.Workspace)/llvm/bin
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;/usr/local/cuda/targets/x86_64-linux/lib
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DHIPIFY_CLANG_TESTS=ON
-DCMAKE_BUILD_TYPE=Release
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.9
-DCUDA_DNN_ROOT_DIR=/usr/local/cuda-12.9
-DCUDA_CUB_ROOT_DIR=/usr/local/cuda-12.9/targets/x86_64-linux/include/cub
-DCUDA_TENSOR_ROOT_DIR=$(Pipeline.Workspace)/cutensor/
multithreadFlag: -- -j32
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
# parameters:
# componentName: HIPIFY
# testDir: $(Build.SourcesDirectory)/build
# testExecutable: make
# testParameters: -j 32 test-hipify
# testPublishResults: false
# os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: combined
registerCUDAPackages: true
extraCopyDirectories:
- llvm-project

View File

@@ -43,18 +43,20 @@ parameters:
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- AMDMIGraphX
- clr
- half
- hipBLAS-common
- hipBLASLt
- llvm-project
- MIOpen
- rocBLAS
- rocDecode
- rocm-cmake
- rocminfo
- rocprofiler-register
- half
- rocBLAS
- MIOpen
- AMDMIGraphX
- ROCR-Runtime
- rpp
- rocDecode
- name: rocmTestDependencies
type: object
default:
@@ -90,8 +92,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -138,7 +138,6 @@ jobs:
runRocminfo: false
- task: Bash@3
displayName: Build kfdtest
continueOnError: true
inputs:
targetType: 'inline'
workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest
@@ -158,7 +157,6 @@ jobs:
os: ${{ job.os }}
- task: Bash@3
displayName: Build rocrtst
continueOnError: true
inputs:
targetType: 'inline'
workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common

View File

@@ -1,4 +1,7 @@
parameters:
- name: componentName
type: string
default: ROCgdb
- name: checkoutRepo
type: string
default: 'self'
@@ -23,8 +26,10 @@ parameters:
- libgmp-dev
- liblzma-dev
- libmpfr-dev
- pkg-config
- ncurses-dev
- pkg-config
- python3-dev
- python3-pip
- texinfo
- zlib1g-dev
- name: rocmDependencies
@@ -40,67 +45,87 @@ parameters:
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: ROCgdb
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: PKG_CONFIG_PATH
value: $(Agent.BuildDirectory)/rocm/share/pkgconfig
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-autotools.yml
parameters:
configureFlags: >-
--program-prefix=roc
--enable-64-bit-bfd
--enable-targets="x86_64-linux-gnu,amdgcn-amd-amdhsa"
--disable-ld
--disable-gas
--disable-gdbserver
--disable-sim
--enable-tui
--disable-gdbtk
--disable-shared
--disable-gprofng
--with-expat
--with-system-zlib
--without-guile
--with-babeltrace
--with-lzma
--with-python=python3
--with-rocm-dbgapi=$(Agent.BuildDirectory)/rocm
LDFLAGS="-Wl,--enable-new-dtags,-rpath=$(Agent.BuildDirectory)/rocm/lib"
makeCallPrefix: LD_RUN_PATH='${ORIGIN}/../lib'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: PKG_CONFIG_PATH
value: $(Agent.BuildDirectory)/rocm/share/pkgconfig
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
name: rocm-ci_medium_build_pool_2404
${{ else }}:
name: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-autotools.yml
parameters:
os: ${{ job.os }}
configureFlags: >-
--program-prefix=roc
--enable-64-bit-bfd
--enable-targets="x86_64-linux-gnu,amdgcn-amd-amdhsa"
--disable-ld
--disable-gas
--disable-gdbserver
--disable-sim
--enable-tui
--disable-gdbtk
--disable-shared
--disable-gprofng
--with-expat
--with-system-zlib
--without-guile
--with-babeltrace
--with-lzma
--with-python=python3
--with-rocm-dbgapi=$(Agent.BuildDirectory)/rocm
LDFLAGS="-Wl,--enable-new-dtags,-rpath=$(Agent.BuildDirectory)/rocm/lib"
makeCallPrefix: LD_RUN_PATH='${ORIGIN}/../lib'
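# LD_RUN_PATH bakes an $ORIGIN-relative rpath into the linked binaries, so the
# installed roc* tools resolve their libraries from ../lib at any install prefix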
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ROCgdb_test_${{ job.target }}
dependsOn: ROCgdb
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -119,18 +144,23 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-autotools.yml
parameters:
os: ${{ job.os }}
configureFlags: >-
--program-prefix=roc
--enable-64-bit-bfd
@@ -166,7 +196,9 @@ jobs:
continueOnError: true
inputs:
targetType: inline
script: make check-gdb TESTS=gdb.rocm/simple.exp
script: |
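# on almalinux8, enable the newer gcc-toolset-14 environment first (assumed
# present in the manylinux container); iif() expands to an empty line elsewhere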
${{ iif(eq(job.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
make check-gdb TESTS=gdb.rocm/simple.exp
workingDirectory: $(Build.SourcesDirectory)
- task: Bash@3
displayName: print gdb log

View File

@@ -86,8 +86,7 @@ jobs:
value: $(Agent.BuildDirectory)/rocm
- name: HIP_INC_DIR
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: Tensile
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -13,10 +32,10 @@ parameters:
- name: aptPackages
type: object
default:
- python3-pip
- cmake
- libmsgpack-dev
- libboost-filesystem-dev
- libboost-program-options-dev
- libmsgpack-dev
- name: pipModules
type: object
default:
@@ -38,75 +57,97 @@ parameters:
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: Tensile_build
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- task: Bash@3
displayName: Create wheel file
inputs:
targetType: inline
script: python3 setup.py bdist_wheel
workingDirectory: $(Build.SourcesDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
parameters:
sourceDir: $(Build.SourcesDirectory)/dist
contentsString: '*.whl'
targetDir: $(Build.ArtifactStagingDirectory)
clean: false
- task: PublishPipelineArtifact@1
displayName: 'wheel file Publish'
retryCountOnTaskFailure: 3
inputs:
targetPath: $(Build.ArtifactStagingDirectory)
- task: Bash@3
displayName: Save pipeline artifact file names
inputs:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: |
whlFile=$(find "$(Build.ArtifactStagingDirectory)" -type f -name "*.whl" | head -n 1)
if [ -n "$whlFile" ]; then
echo $(basename "$whlFile") >> pipelineArtifacts.txt
fi
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- task: Bash@3
displayName: Create wheel file
inputs:
targetType: inline
script: python3 setup.py bdist_wheel
workingDirectory: $(Agent.BuildDirectory)/s
- task: Bash@3
displayName: Rename wheel file with job OS
inputs:
targetType: inline
workingDirectory: $(Agent.BuildDirectory)/s
script: |
wheelFile=$(find "$(Agent.BuildDirectory)/s/dist" -type f -name "*.whl" | head -n 1)
newWheelFile="$(basename "$wheelFile" .whl)-${{ job.os }}.whl"
mv "$wheelFile" "$(dirname "$wheelFile")/$newWheelFile"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
parameters:
sourceDir: $(Agent.BuildDirectory)/s/dist
contentsString: '*.whl'
targetDir: $(Build.ArtifactStagingDirectory)
clean: false
- task: PublishPipelineArtifact@1
displayName: 'wheel file Publish'
retryCountOnTaskFailure: 3
inputs:
targetPath: $(Build.ArtifactStagingDirectory)
- task: Bash@3
displayName: Save pipeline artifact file names
inputs:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: |
whlFile=$(find "$(Build.ArtifactStagingDirectory)" -type f -name "*.whl" | head -n 1)
if [ -n "$whlFile" ]; then
echo $(basename "$whlFile") >> pipelineArtifacts.txt
fi
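# pipelineArtifacts.txt is assumed to be read by the artifact-links.yml step
# below when publishing links to the produced files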
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: Tensile_test_${{ job.target }}
- job: Tensile_test_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 180
dependsOn: Tensile_build
dependsOn: Tensile_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -126,20 +167,23 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: DownloadPipelineArtifact@2
displayName: 'Download Pipeline Wheel Files'
inputs:
itemPattern: '**/*.whl'
itemPattern: '**/*${{ job.os }}*.whl'
targetPath: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- task: Bash@3
displayName: pip install
@@ -164,7 +208,7 @@ jobs:
inputs:
targetType: inline
script: tox run -v -e ci -- -m pre_checkin
workingDirectory: $(Build.SourcesDirectory)
workingDirectory: $(Agent.BuildDirectory)/s
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -104,7 +104,7 @@ jobs:
parameters:
componentName: amdsmi
testDir: '$(Agent.BuildDirectory)'
testExecutable: './rocm/share/amd_smi/tests/amdsmitst'
testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml

View File

@@ -1,4 +1,7 @@
parameters:
- name: componentName
type: string
default: aomp
- name: checkoutRepo
type: string
default: 'self'
@@ -15,173 +18,187 @@ parameters:
- name: aptPackages
type: object
default:
- bison
- ccache
- cmake
- python3-pip
- ninja-build
- pkg-config
- libpci-dev
- libnuma-dev
- libffi-dev
- git
- libopenmpi-dev
- flex
- gawk
- git
- mesa-common-dev
- libtool
- ninja-build
- libbabeltrace-dev
- libbison-dev
- libdrm-amdgpu1
- libdrm-dev
- libdw-dev
- libgtest-dev
- libsystemd-dev
- libffi-dev
- libgmp-dev
- liblzma-dev
- libmpfr-dev
- libncurses5-dev
- libnuma-dev
- libopenmpi-dev
- libpci-dev
- libssl-dev
- libstdc++-12-dev
- ccache
- libgmp-dev
- libmpfr-dev
- texinfo
- libbison-dev
- bison
- flex
- libbabeltrace-dev
- libncurses5-dev
- liblzma-dev
- python3-setuptools
- python3-dev
- libsystemd-dev
- libtool
- libudev-dev
- parallel
# Reference: comment snippet from
# https://github.com/ROCm/aomp/blob/aomp-dev/bin/build_aomp.sh#L131-L134
#
# For ROCM build (AOMP_STANDALONE_BUILD=0) the components roct, rocr,
# libdevice, project, comgr, rocminfo, hipamd, rocdbgapi, rocgdb,
# roctracer, rocprofiler, rocm_smi_lib, and amdsmi should be found
# in ROCM in /opt/rocm. The ROCM build only needs these components:
- pkg-config
- python3-dev
- python3-pip
- python3-setuptools
- texinfo
- name: rocmDependencies
type: object
default:
- amdsmi
- llvm-project
- ROCR-Runtime
- name: rocmTestDependencies
type: object
default:
- clr
- llvm-project
- ROCdbgapi
- ROCgdb
- rocm-cmake
- rocm-core
- rocminfo
- rocm_smi_lib
- rocprofiler
- rocprofiler-register
- rocprofiler-sdk
- ROCR-Runtime
- roctracer
- rocprofiler-register
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: aomp
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
# checkout the repos tied to openmp-extras, plus llvm-project
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: aomp-extras_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: flang_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: llvm-project_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: extras
cmakeBuildDir: '$(Build.SourcesDirectory)/aomp-extras/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/aomp-extras'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm
-DCMAKE_BUILD_TYPE=Release
-DAOMP_STANDALONE_BUILD=0
-DAOMP_VERSION_STRING=9.99.99
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: openmp
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/openmp/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/openmp'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
-DCMAKE_BUILD_TYPE=Release
-DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DOPENMP_ENABLE_LIBOMPTARGET=1
-DLIBOMP_COPY_EXPORTS=OFF
-DLIBOMP_OMPT_SUPPORT=ON
-DLIBOMP_OMPD_SUPPORT=ON
-DCMAKE_SKIP_INSTALL_RPATH=TRUE
-DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
-DLIBOMP_FORTRAN_MODULES_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/flang
-DLIBOMP_MODULES_INSTALL_PATH=$(Build.BinariesDirectory)/llvm/include/flang/
-GNinja
- task: Bash@3
displayName: 'ROCm symbolic link'
inputs:
targetType: inline
script: sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: offload
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/offload/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/offload'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
-DCMAKE_BUILD_TYPE=Release
-DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_SKIP_INSTALL_RPATH=TRUE
-DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
-DLIBOMPTARGET_LLVM_INCLUDE_DIRS=$(Build.SourcesDirectory)/llvm-project/llvm/include
-DCMAKE_EXE_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
-DCMAKE_SHARED_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
name: rocm-ci_medium_build_pool_2404
${{ else }}:
name: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
# checkout the repos tied to openmp-extras, plus llvm-project
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: aomp-extras_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: flang_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: llvm-project_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
componentName: extras
cmakeBuildDir: '$(Build.SourcesDirectory)/aomp-extras/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/aomp-extras'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm
-DCMAKE_BUILD_TYPE=Release
-DAOMP_STANDALONE_BUILD=0
-DAOMP_VERSION_STRING=9.99.99
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
componentName: openmp
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/openmp/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/openmp'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
-DCMAKE_BUILD_TYPE=Release
-DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DOPENMP_ENABLE_LIBOMPTARGET=1
-DLIBOMP_COPY_EXPORTS=OFF
-DLIBOMP_OMPD_SUPPORT=ON
-DCMAKE_SKIP_INSTALL_RPATH=TRUE
-DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
-DLIBOMP_FORTRAN_MODULES_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/flang
-DLIBOMP_MODULES_INSTALL_PATH=$(Build.BinariesDirectory)/llvm/include/flang/
multithreadFlag: -- -j32
- task: Bash@3
displayName: 'ROCm symbolic link'
inputs:
targetType: inline
script: sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
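# the offload build below is assumed to expect a ROCm install at /opt/rocm,
# so expose the staged rocm directory there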
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
componentName: offload
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/offload/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/offload'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
-DCMAKE_BUILD_TYPE=Release
-DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_SKIP_INSTALL_RPATH=TRUE
-DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
-DLIBOMPTARGET_LLVM_INCLUDE_DIRS=$(Build.SourcesDirectory)/llvm-project/llvm/include
-DCMAKE_EXE_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
-DCMAKE_SHARED_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: aomp_test_${{ job.target }}
dependsOn: aomp
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -198,12 +215,16 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
- task: Bash@3
displayName: ROCm symbolic link
inputs:
@@ -215,7 +236,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: aomp-extras_repo
# these copy steps are from the aomp prototype script for test prep
- task: CopyFiles@2
displayName: 'Copy AOMP contents'
inputs:

View File

@@ -107,6 +107,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
gpuTarget: ${{ job.target }}
# if this artifact name is changed, please also update $ARTIFACT_URL inside miopen-get-ck-build.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
gpuTarget: ${{ job.target }}

View File

@@ -1,36 +1,44 @@
parameters:
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
- name: jobMatrix
type: object
default:
copyJobs:
- { os: ubuntu2204, backend: amd }
- { os: almalinux8, backend: amd }
- { os: ubuntu2204, backend: nvidia }
- { os: almalinux8, backend: nvidia }
# hip and clr are tightly-coupled
# run this same template for both repos
# any changes for clr should just trigger HIP pipeline
jobs:
- job: hip_clr_combined
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
# checkout nothing, just copy artifacts from triggering HIP job
# and then publish for this clr job or for this hipother job to maintain latest
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-download.yml
parameters:
componentName: HIP
pipelineId: $(HIP_PIPELINE_ID)
- task: Bash@3
displayName: Copy HIP artifacts
inputs:
targetType: inline
script: cp -a $(Agent.BuildDirectory)/rocm/* $(Build.BinariesDirectory)/
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ each job in parameters.jobMatrix.copyJobs }}:
- job: hip_clr_combined_${{ job.os }}_${{ job.backend }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
# checkout nothing, just copy artifacts from triggering HIP job
# and then publish for this clr job or for this hipother job to maintain latest
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-download.yml
parameters:
componentName: HIP
pipelineId: $(HIP_PIPELINE_ID)
fileFilter: ${{ job.os }}*${{ job.backend }}
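# only fetch the HIP artifact matching this job's OS and backend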
- task: Bash@3
displayName: Copy HIP artifacts
inputs:
targetType: inline
script: cp -a $(Agent.BuildDirectory)/rocm/* $(Build.BinariesDirectory)/
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml

View File

@@ -33,19 +33,24 @@ parameters:
type: object
default:
- cmake
- ninja-build
- git
- ninja-build
- wget
- python3-pip
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- clr
- llvm-project
- rocm-cmake
- rocminfo
- ROCR-Runtime
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
- name: downstreamComponentMatrix
type: object
default:
@@ -54,57 +59,68 @@ parameters:
sparseCheckoutDir: projects/hipblaslt
skipUnifiedBuild: 'false'
buildDependsOn:
gfx942:
- hipBLAS_common
gfx90a:
- hipBLAS_common
- hipBLAS_common_build
jobs:
- job: hipBLAS_common
variables:
- group: common
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# extraEnvVars:
# - ROCM_PATH:::/home/user/workspace/rocm
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hipBLAS_common_build_${{ job.os }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
componentName: ${{ parameters.componentName }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# extraEnvVars:
# - ROCM_PATH:::/home/user/workspace/rocm
- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:

View File

@@ -32,6 +32,8 @@ parameters:
- name: aptPackages
type: object
default:
- ccache
- gfortran
- git
- libdrm-dev
- libmsgpack-dev
@@ -39,9 +41,6 @@ parameters:
- ninja-build
- python3-pip
- python3-venv
- gfortran
- libblas-dev
- ccache
- name: pipModules
type: object
default:
@@ -78,15 +77,19 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { pool: rocm-ci_ultra_build_pool, os: ubuntu2204, packageManager: apt, target: gfx942 }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx90a }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { pool: rocm-ci_ultra_build_pool, os: almalinux8, packageManager: dnf, target: gfx942 }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx90a }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1201 }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1100 }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
type: object
default:
@@ -95,17 +98,16 @@ parameters:
sparseCheckoutDir: projects/rocblas
skipUnifiedBuild: 'false'
buildDependsOn:
gfx942:
- hipBLASLt_build_gfx942
gfx90a:
- hipBLASLt_build_gfx90a
- hipBLASLt_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 300
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -119,7 +121,11 @@ jobs:
value: $(Agent.BuildDirectory)/rocm
- name: DAY_STRING
value: $[format('{0:ddMMyyyy}', pipeline.startTime)]
pool: ${{ variables.ULTRA_BUILD_POOL }}
pool: ${{ job.pool }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
@@ -127,16 +133,22 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
@@ -148,22 +160,17 @@ jobs:
script: |
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
# Build and install gtest, lapack, hipBLAS-common
# $(Pipeline.Workspace)/deps is a temporary folder for the build process
# $(Pipeline.Workspace)/s/deps is part of the hipBLASLt repo
- script: mkdir $(Pipeline.Workspace)/deps
displayName: Create temp folder for external dependencies
# hipBLASLt already has a CMake script for external deps, so we can just run that
# https://github.com/ROCm/hipBLASLt/blob/develop/deps/CMakeLists.txt
- script: cmake $(Pipeline.Workspace)/s/deps
displayName: Configure hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- script: make
displayName: Build hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- script: sudo make install
displayName: Install hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- task: Bash@3
displayName: Build and install LAPACK
inputs:
targetType: inline
script: |
mkdir -p $(Agent.BuildDirectory)/temp-deps
cd $(Agent.BuildDirectory)/temp-deps
# position-independent LAPACK is required for almalinux8 builds
cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
make
sudo make install
- script: |
mkdir -p $(CCACHE_DIR)
echo "##vso[task.prependpath]/usr/lib/ccache"
@@ -171,58 +178,58 @@ jobs:
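# the ccache cache is keyed per OS, GPU target, and day; restoreKeys fall back
# from most to least specific so an older cache for the same OS/target is reused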
- task: Cache@2
displayName: Ccache caching
inputs:
key: hipBLASLt | $(Agent.OS) | ${{ job.target }} | $(DAY_STRING) | $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
key: hipBLASLt | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING) | $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
path: $(CCACHE_DIR)
restoreKeys: |
hipBLASLt | $(Agent.OS) | ${{ job.target }} | $(DAY_STRING)
hipBLASLt | $(Agent.OS) | ${{ job.target }}
hipBLASLt | $(Agent.OS)
hipBLASLt | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING)
hipBLASLt | ${{ job.os }} | ${{ job.target }}
hipBLASLt | ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-DCMAKE_C_COMPILER_LAUNCHER=ccache
-DAMDGPU_TARGETS=${{ job.target }}
-DTensile_LOGIC=
-DTensile_CPU_THREADS=
-DTensile_LIBRARY_FORMAT=msgpack
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-DBUILD_CLIENTS_TESTS=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
installLatestCMake: true
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/amdclang
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
- ROCM_PATH:::/home/user/workspace/rocm
extraCopyDirectories:
- deps
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
installLatestCMake: true
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/amdclang
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
- ROCM_PATH:::/home/user/workspace/rocm
extraCopyDirectories:
- deps
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 300
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -238,6 +245,7 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
@@ -246,12 +254,16 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
@@ -259,6 +271,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './hipblaslt-test'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'

View File

@@ -33,9 +33,8 @@ parameters:
type: object
default:
- cmake
- ninja-build
- libgtest-dev
- git
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
@@ -52,79 +51,102 @@ parameters:
- llvm-project
- rocminfo
- rocPRIM
- ROCR-Runtime
- rocprofiler-register
- ROCR-Runtime
- name: jobMatrix
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DBUILD_BENCHMARK=ON
-DBUILD_TEST=ON
-DAMDGPU_TARGETS=${{ job.target }}
-GNinja
extraCxxFlags: -Wno-deprecated-declarations
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -142,17 +164,20 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
@@ -160,6 +185,7 @@ jobs:
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/hipcub'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: hipFFT
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -61,7 +80,11 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hipFFT_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.target }} # todo: add OS
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -79,12 +102,15 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
@@ -102,9 +128,11 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -113,8 +141,8 @@ jobs:
# gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: hipFFT_test_${{ job.target }}
dependsOn: hipFFT_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -128,12 +156,14 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
@@ -141,10 +171,12 @@ jobs:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: hipFFT
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './hipfft-test'
testParameters: '--test_prob 0.002 --gtest_output=xml:./test_output.xml --gtest_color=yes'

View File

@@ -33,18 +33,18 @@ parameters:
type: object
default:
- cmake
- ninja-build
- googletest
- git
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
default:
- llvm-project
- ROCR-Runtime
- clr
- llvm-project
- rocm-cmake
- rocminfo
- rocRAND
- ROCR-Runtime
- name: rocmTestDependencies
type: object
default:
@@ -52,62 +52,88 @@ parameters:
- llvm-project
- rocminfo
- rocprofiler-register
- ROCR-Runtime
- rocRAND
- ROCR-Runtime
- name: jobMatrix
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
# - name: downstreamComponentMatrix
# type: object
# default:
# - rocFFT:
# name: rocFFT
# sparseCheckoutDir: projects/rocfft
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - hipRAND_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DBUILD_TEST=ON
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=${{ job.target }}
-GNinja
@@ -116,22 +142,25 @@ jobs:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# gpuTarget: ${{ job.target }}
# extraEnvVars:
# - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -149,10 +178,12 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
@@ -160,6 +191,7 @@ jobs:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
@@ -167,8 +199,21 @@ jobs:
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/hipRAND'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}

View File

@@ -70,8 +70,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: hipSPARSE
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -14,13 +33,11 @@ parameters:
type: object
default:
- cmake
- ninja-build
- libboost-program-options-dev
- googletest
- libfftw3-dev
- git
- gfortran
- libgtest-dev
- git
- libboost-program-options-dev
- libfftw3-dev
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
@@ -49,19 +66,31 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
type: object
default:
- hipSPARSELt:
name: hipSPARSELt
sparseCheckoutDir: projects/hipsparselt
skipUnifiedBuild: 'false'
buildDependsOn:
- hipSPARSE_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hipSPARSE_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -73,42 +102,57 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake/
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_SAMPLES=OFF
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
artifactName: hipSPARSE
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: hipSPARSE
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
publish: false
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
parameters:
sourceDir: $(Build.SourcesDirectory)/build/clients
sourceDir: $(Agent.BuildDirectory)/s/build/clients
contentsString: matrices/**
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
artifactName: testMatrices
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
@@ -116,44 +160,65 @@ jobs:
# environment: test
# gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: hipSPARSE_test_${{ job.target }}
dependsOn: hipSPARSE_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: hipSPARSE
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './hipsparse-test'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './hipsparse-test'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: hipSPARSELt
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -56,15 +75,17 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
testJobs:
- gfx942:
target: gfx942
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hipSPARSELt_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -86,17 +107,22 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
# Build and install gtest and lapack
# $(Pipeline.Workspace)/deps is a temporary folder for the build process
# $(Pipeline.Workspace)/s/deps is part of the hipSPARSELt repo
@@ -115,6 +141,7 @@ jobs:
workingDirectory: $(Pipeline.Workspace)/deps
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
@@ -130,64 +157,80 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraCopyDirectories:
- deps
extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang
- CMAKE_CXX_COMPILER:::/home/user/workspace/rocm/llvm/bin/hipcc
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
installLatestCMake: true
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraCopyDirectories:
- deps
extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang
- CMAKE_CXX_COMPILER:::/home/user/workspace/rocm/llvm/bin/hipcc
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
installLatestCMake: true
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: hipSPARSELt_test_${{ job.target }}
dependsOn: hipSPARSELt_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: hipSPARSELt
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './hipsparselt-test'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 120
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './hipsparselt-test'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}

View File

@@ -30,13 +30,17 @@ parameters:
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: ubuntu2404, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: llvm_project_${{ job.os }}
pool:
name: 'rocm-ci_ultra_build_pool'
${{ if eq(job.os, 'ubuntu2404') }}:
name: 'rocm-ci_high_build_pool_2404' #temporarily using 'high' pool while 'ultra' is down
${{ else }}:
name: 'rocm-ci_ultra_build_pool'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
@@ -63,7 +67,6 @@ jobs:
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
skipLlvmSymlink: true
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
@@ -104,6 +107,12 @@ jobs:
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm'
installDir: '$(Build.BinariesDirectory)/llvm'
# use llvm-lit to run unit tests for llvm, clang, and lld
- task: Bash@3
displayName: 'Copy llvm-lit to install directory'
inputs:
targetType: inline
script: |
cp $(Build.SourcesDirectory)/llvm/build/bin/llvm-lit $(Build.BinariesDirectory)/llvm/bin/
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: check-llvm

View File

@@ -15,7 +15,6 @@ parameters:
default:
- cmake
- git
- googletest
- libboost-program-options-dev
- libdrm-dev
- libfftw3-dev
@@ -90,6 +89,10 @@ jobs:
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
submoduleBehaviour: recursive
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
@@ -101,12 +104,11 @@ jobs:
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
-DCMAKE_BUILD_TYPE=Release
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TESTS=ON
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake;$(Agent.BuildDirectory)/rocm/libexec/hipify
-DAMDGPU_TARGETS=${{ job.target }}
-DGPU_TARGETS=${{ job.target }}
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:

View File

@@ -86,8 +86,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -73,8 +73,7 @@ jobs:
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -33,17 +33,15 @@ parameters:
type: object
default:
- cmake
- ninja-build
- python3-venv
- git
- libmsgpack-dev
- gfortran
- libopenblas-dev
- googletest
- libgtest-dev
- wget
- python3-pip
- libdrm-dev
- libmsgpack-dev
- libopenblas-dev
- ninja-build
- python3-pip
- python3-venv
- wget
- name: pipModules
type: object
default:
@@ -52,18 +50,17 @@ parameters:
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- clr
- rocminfo
- rocprofiler-register
- rocm_smi_lib
- rocm-core
- aomp
- aomp-extras
- clr
- hipBLAS-common
- hipBLASLt
- llvm-project
- rocm-cmake
- rocm-core
- rocm_smi_lib
- rocminfo
- rocprofiler-register
- ROCR-Runtime
- roctracer
- name: rocmTestDependencies
type: object
@@ -83,44 +80,49 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
type: object
default:
# rocSOLVER depends on both rocBLAS and rocPRIM
# for a unified build, rocBLAS will be the one to call rocSOLVER
- rocSOLVER:
name: rocSOLVER
sparseCheckoutDir: projects/rocsolver
- rocSPARSE:
name: rocSPARSE
sparseCheckoutDir: projects/rocsparse
skipUnifiedBuild: 'false'
buildDependsOn:
gfx942:
- rocBLAS_build_gfx942
gfx90a:
- rocBLAS_build_gfx90a
unifiedBuild:
downstreamAggregateNames: rocBLAS+rocPRIM
buildDependsOn:
gfx942:
- rocBLAS_build_gfx942
- rocPRIM_build_gfx942
gfx90a:
- rocBLAS_build_gfx90a
- rocPRIM_build_gfx90a
- rocBLAS_build
# rocSOLVER depends on both rocBLAS and rocPRIM
# for a unified build, rocBLAS will be the one to call rocSOLVER
# - rocSOLVER:
# name: rocSOLVER
# sparseCheckoutDir: projects/rocsolver
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - rocBLAS_build
# unifiedBuild:
# downstreamAggregateNames: rocBLAS+rocPRIM
# buildDependsOn:
# - rocBLAS_build
# - rocPRIM_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -133,6 +135,10 @@ jobs:
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
@@ -140,6 +146,7 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -147,59 +154,62 @@ jobs:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aocl.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_TOOLCHAIN_FILE=toolchain-linux.cmake
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang
-DGPU_TARGETS=${{ job.target }}
-DTensile_CODE_OBJECT_VERSION=default
-DTensile_LOGIC=asm_full
-DTensile_SEPARATE_ARCHITECTURES=ON
-DTensile_LAZY_LIBRARY_LOADING=ON
-DTensile_LIBRARY_FORMAT=msgpack
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF
-DBUILD_CLIENTS_SAMPLES=OFF
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
installAOCL: true
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
- ROCM_PATH:::/home/user/workspace/rocm
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
installAOCL: true
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
- ROCM_PATH:::/home/user/workspace/rocm
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 120
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -213,6 +223,7 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
@@ -221,12 +232,16 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
@@ -234,6 +249,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocblas-test'
testParameters: '--yaml rocblas_smoke.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
@@ -251,11 +267,11 @@ jobs:
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}
${{ if parameters.unifiedBuild }}:
buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
${{ else }}:
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}

View File

@@ -1,4 +1,7 @@
parameters:
- name: componentName
type: string
default: rocDecode
- name: checkoutRepo
type: string
default: 'self'
@@ -13,29 +16,28 @@ parameters:
- name: aptPackages
type: object
default:
- python3-pip
- cmake
- ninja-build
- pkg-config
- ffmpeg
- libavcodec-dev
- libavformat-dev
- libavutil-dev
- libdrm-dev
- libstdc++-12-dev
- libva-amdgpu-dev
- mesa-amdgpu-va-drivers
- libdrm-dev
- ninja-build
- pkg-config
- python3-pip
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- clr
- rocminfo
- llvm-project
- rocm-cmake
- rocm-core
- rocminfo
- rocprofiler-register
- ROCR-Runtime
- name: rocmTestDependencies
type: object
default:
@@ -48,53 +50,70 @@ parameters:
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: rocDecode_build
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# registerROCmPackages: true
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# registerROCmPackages: true
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocDecode_test_${{ job.target }}
dependsOn: rocDecode_build
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -114,20 +133,27 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- task: Bash@3
displayName: Build rocDecode tests
inputs:
targetType: inline
script: |
${{ iif(eq(job.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
mkdir rocDecode-tests
cd rocDecode-tests
cmake $(Agent.BuildDirectory)/rocm/share/rocdecode/test
@@ -136,6 +162,7 @@ jobs:
parameters:
componentName: rocDecode
testDir: 'rocDecode-tests'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: rocFFT
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -59,10 +78,23 @@ parameters:
target: gfx942
- gfx90a:
target: gfx90a
# - name: downstreamComponentMatrix
# type: object
# default:
# - hipFFT:
# name: hipFFT
# sparseCheckoutDir: projects/hipfft
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - rocFFT_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocFFT_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_ubuntu2204_${{ job.target }} # todo: un-hardcode OS
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -79,12 +111,15 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
@@ -101,9 +136,11 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -114,8 +151,8 @@ jobs:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocFFT_test_${{ job.target }}
dependsOn: rocFFT_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -129,12 +166,14 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
@@ -142,10 +181,12 @@ jobs:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocFFT
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocfft-test'
testParameters: '--test_prob 0.004 --gtest_output=xml:./test_output.xml --gtest_color=yes'
@@ -154,3 +195,15 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}

View File

@@ -1,4 +1,7 @@
parameters:
- name: componentName
type: string
default: rocJPEG
- name: checkoutRepo
type: string
default: 'self'
@@ -44,32 +47,44 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocJPEG_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'ubuntu2404') }}:
name: rocm-ci_medium_build_pool_2404
${{ else }}:
name: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -80,17 +95,26 @@ jobs:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-DCMAKE_BUILD_TYPE=Release
-DGPU_TARGETS=${{ job.target }}
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
@@ -99,8 +123,8 @@ jobs:
# registerROCmPackages: true
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocJPEG_test_${{ job.target }}
dependsOn: rocJPEG_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -120,22 +144,28 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- task: Bash@3
displayName: Build rocJPEG tests
inputs:
targetType: inline
script: |
${{ iif(eq(job.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
mkdir rocJPEG-tests
cd rocJPEG-tests
cmake $(Agent.BuildDirectory)/rocm/share/rocjpeg/test
@@ -144,6 +174,7 @@ jobs:
parameters:
componentName: rocJPEG
testDir: 'rocJPEG-tests'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -27,6 +27,7 @@ parameters:
- numpy
- tomli
- scipy
- pybind11
- name: rocmDependencies
type: object
default:

View File

@@ -68,8 +68,12 @@ parameters:
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 1, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 2, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 3, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a, shard: 1, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a, shard: 2, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a, shard: 3, shardCount: 3 }
- name: downstreamComponentMatrix
type: object
default:
@@ -78,36 +82,29 @@ parameters:
sparseCheckoutDir: projects/rocthrust
skipUnifiedBuild: 'false'
buildDependsOn:
gfx942:
- rocPRIM_build_gfx942
gfx90a:
- rocPRIM_build_gfx90a
- rocPRIM_build
- hipCUB:
name: hipCUB
sparseCheckoutDir: projects/hipcub
skipUnifiedBuild: 'false'
buildDependsOn:
gfx942:
- rocPRIM_build_gfx942
gfx90a:
- rocPRIM_build_gfx90a
- rocPRIM_build
# rocSOLVER depends on both rocBLAS and rocPRIM
# for a unified build, rocBLAS will be the one to call rocSOLVER
- rocSOLVER:
name: rocSOLVER
sparseCheckoutDir: projects/rocsolver
skipUnifiedBuild: 'true'
buildDependsOn:
gfx942:
- rocPRIM_build_gfx942
gfx90a:
- rocPRIM_build_gfx90a
# - rocSOLVER:
# name: rocSOLVER
# sparseCheckoutDir: projects/rocsolver
# skipUnifiedBuild: 'true'
# buildDependsOn:
# - rocPRIM_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -144,6 +141,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
@@ -172,7 +170,7 @@ jobs:
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}_shard_${{ job.shard }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
@@ -212,6 +210,7 @@ jobs:
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocprim'
extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }} -E device_merge_inplace'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:

View File

@@ -36,6 +36,7 @@ parameters:
- clr
- llvm-project
- rocDecode
- rocJPEG
- rocm-cmake
- rocm-core
- rocminfo
@@ -192,9 +193,9 @@ jobs:
inputs:
itemPattern: '**/*.whl'
targetPath: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
@@ -221,25 +222,17 @@ jobs:
- task: CMake@1
displayName: 'rocPyDecode Test CMake Flags'
inputs:
workingDirectory: $(Agent.BuildDirectory)/rocm/share/rocpydecode/tests
cmakeArgs: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(PYTHON_USER_SITE)/pybind11;$(PYTHON_DIST_PACKAGES)/pybind11;$(PYBIND11_PATH)
-DCMAKE_BUILD_TYPE=Release
-DGPU_TARGETS=${{ job.target }}
..
.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocPyDecode
testDir: $(Build.SourcesDirectory)/build
# sudo required for pip install but screws up permissions for next pipeline run
- task: Bash@3
displayName: Clean up test environment
condition: always()
inputs:
targetType: inline
script: |
pip uninstall -y rocPyDecode
pip uninstall -y hip-python
testDir: $(Agent.BuildDirectory)/rocm/share/rocpydecode/tests
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -78,16 +78,15 @@ parameters:
sparseCheckoutDir: projects/hiprand
skipUnifiedBuild: 'false'
buildDependsOn:
gfx942:
- rocRAND_build_gfx942
gfx90a:
- rocRAND_build_gfx90a
- rocRAND_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml

View File

@@ -33,13 +33,11 @@ parameters:
type: object
default:
- cmake
- ninja-build
- libsuitesparse-dev
- gfortran
- libfmt-dev
- git
- googletest
- libgtest-dev
- libfmt-dev
- libsuitesparse-dev
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
@@ -72,31 +70,42 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
@@ -108,10 +117,15 @@ jobs:
targetType: inline
script: git clone --depth 1 --branch v3.9.1 https://github.com/Reference-LAPACK/lapack
workingDirectory: '$(Build.SourcesDirectory)'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
@@ -119,8 +133,10 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: lapack
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_Fortran_FLAGS=-fno-optimize-sibling-calls
-DBUILD_TESTING=OFF
-DCBLAS=ON
@@ -131,8 +147,9 @@ jobs:
installDir: '$(Pipeline.Workspace)/deps-install'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install;$(Agent.BuildDirectory)/vendor
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=${{ job.target }}
@@ -144,23 +161,26 @@ jobs:
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraCopyDirectories:
- deps-install
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraCopyDirectories:
- deps-install
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -174,6 +194,7 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
@@ -181,12 +202,16 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
@@ -194,6 +219,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocsolver-test'
testParameters: '--gtest_filter="*checkin*" --gtest_output=xml:./test_output.xml --gtest_color=yes'

View File
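
The rocSOLVER template above folds os and packageManager into the job matrix and runs almalinux8 builds inside a manylinux container. A reduced sketch of the pattern, using the container image and endpoint shown in the diff and a placeholder component name:

parameters:
- name: jobMatrix
  type: object
  default:
    buildJobs:
    - { os: ubuntu2204, packageManager: apt, target: gfx942 }
    - { os: almalinux8, packageManager: dnf, target: gfx942 }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: example_build_${{ job.os }}_${{ job.target }}  # 'example' is a placeholder
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3

The packageManager field presumably lets dependencies-other.yml choose apt or dnf without needing a second template.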

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: rocSPARSE
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -13,27 +32,25 @@ parameters:
- name: aptPackages
type: object
default:
- python3-pip
- cmake
- ninja-build
- libboost-program-options-dev
- googletest
- libfftw3-dev
- git
- gfortran
- libgtest-dev
- git
- libboost-program-options-dev
- libdrm-dev
- libfftw3-dev
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- clr
- llvm-project
- rocBLAS
- rocm-cmake
- rocminfo
- rocPRIM
- rocprofiler-register
- ROCR-Runtime
- roctracer
- name: rocmTestDependencies
type: object
@@ -52,19 +69,39 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
type: object
default:
- hipSPARSE:
name: hipSPARSE
sparseCheckoutDir: projects/hipsparse
skipUnifiedBuild: 'false'
buildDependsOn:
- rocSPARSE_build
# hipSOLVER depends on both rocSOLVER and rocSPARSE
# for a unified build, rocSOLVER will be the one to call hipSOLVER
# - hipSOLVER:
# name: hipSOLVER
# sparseCheckoutDir: projects/hipsolver
# skipUnifiedBuild: 'true'
# buildDependsOn:
# - rocSPARSE_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocSPARSE_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -77,22 +114,32 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=${{ job.target }}
@@ -103,68 +150,94 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
artifactName: rocSPARSE
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: rocSPARSE
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
publish: false
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
parameters:
sourceDir: $(Build.SourcesDirectory)/build/clients
sourceDir: $(Agent.BuildDirectory)/s/build/clients
contentsString: matrices/**
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
artifactName: testMatrices
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocSPARSE_test_${{ job.target }}
timeoutInMinutes: 90
dependsOn: rocSPARSE_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocSPARSE
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocsparse-test'
testParameters: '--gtest_filter="*quick*" --gtest_output=xml:./test_output.xml --gtest_color=yes'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 120
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocsparse-test'
testParameters: '--gtest_filter="*quick*" --gtest_output=xml:./test_output.xml --gtest_color=yes'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}

View File
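
rocSPARSE now dispatches its downstream components through downstreamComponentMatrix: entries flagged skipUnifiedBuild are skipped during unified builds, and downstreamAggregateNames accumulates the chain of upstream component names as the templates recurse. A condensed sketch of the dispatch loop from the diff:

- ${{ if parameters.triggerDownstreamJobs }}:
  - ${{ each component in parameters.downstreamComponentMatrix }}:
    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
        parameters:
          buildDependsOn: ${{ component.buildDependsOn }}
          # one hop deep this is '+rocSPARSE'; hipSPARSE would append '+hipSPARSE' for its own downstreams
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
          triggerDownstreamJobs: true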

@@ -33,17 +33,17 @@ parameters:
type: object
default:
- cmake
- git
- ninja-build
- libboost-program-options-dev
- googletest
- libfftw3-dev
- git
- python3-pip
- name: rocmDependencies
type: object
default:
- clr
- llvm-project
- rocm-cmake
- rocminfo
- rocPRIM
- ROCR-Runtime
@@ -54,58 +54,76 @@ parameters:
- llvm-project
- rocminfo
- rocPRIM
- ROCR-Runtime
- rocprofiler-register
- ROCR-Runtime
- name: jobMatrix
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-GNinja
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DAMDGPU_TARGETS=${{ job.target }}
-DBUILD_TEST=ON
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
@@ -113,20 +131,23 @@ jobs:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -144,17 +165,20 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
@@ -163,6 +187,7 @@ jobs:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocthrust'
testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml --exclude-regex "scan.hip"'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File
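
These test jobs are gated on per-target variables from the common variable group. A condensed sketch of the condition, assuming the group defines ENABLE_GFX942_TESTS and a comma-separated DISABLED_GFX942_TESTS list:

condition: >-
  and(succeeded(),
  eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
  not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
  eq(${{ parameters.aggregatePipeline }}, False))

upper(job.target) turns gfx942 into GFX942 at expansion time, so one variable group can enable or disable tests per GPU family, and the containsValue/split pair treats the disable list as a set of component names.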

@@ -184,7 +184,7 @@ jobs:
parameters:
componentName: rocm-examples
testDir: $(Build.SourcesDirectory)/build
testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml --exclude-regex "rocfft_callback"'
testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -105,7 +105,7 @@ jobs:
parameters:
componentName: rocm_smi_lib
testDir: '$(Agent.BuildDirectory)'
testExecutable: './rocm/share/rocm_smi/rsmitst_tests/rsmitst'
testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml

View File

@@ -67,7 +67,6 @@ jobs:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
skipLlvmSymlink: true
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:

View File

@@ -24,24 +24,28 @@ parameters:
default:
- astunparse==1.6.2
- colorlover
- "dash>=1.12.0"
- dash-bootstrap-components
- dash-svg
- "dash>=3.0.0"
- kaleido==0.2.1
- matplotlib
- "numpy>=1.17.5"
- "pandas>=1.4.3"
- plotext
- plotille
- pymongo
- pyyaml
- tabulate
- tqdm
- dash-svg
- dash-bootstrap-components
- kaleido
- setuptools
- plotille
- tabulate
- textual
- textual_plotext
- textual-fspicker
- tqdm
- mock
- pytest
- pytest-cov
- pytest-xdist
- name: rocmDependencies
- name: rocmTestDependencies
type: object
default:
- amdsmi
@@ -114,14 +118,6 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: ${{ job.dependencySource }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
@@ -165,14 +161,6 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: Bash@3
displayName: Add en_US.UTF-8 locale
inputs:
targetType: inline
script: |
sudo locale-gen en_US.UTF-8
sudo update-locale
locale -a
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -184,9 +172,17 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
dependencyList: ${{ parameters.rocmTestDependencies }}
dependencySource: ${{ job.dependencySource }}
gpuTarget: ${{ job.target }}
- task: Bash@3
displayName: Add en_US.UTF-8 locale
inputs:
targetType: inline
script: |
sudo locale-gen en_US.UTF-8
sudo update-locale
locale -a
- task: Bash@3
displayName: Add ROCm binaries to PATH
inputs:

View File
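
The hunk above splits the tool's dependency lists (rocmDependencies for the build job, rocmTestDependencies for the test job) and moves locale setup to after the dependency download. The locale step itself is a self-contained Bash task; a sketch with descriptive comments:

- task: Bash@3
  displayName: Add en_US.UTF-8 locale
  inputs:
    targetType: inline
    script: |
      sudo locale-gen en_US.UTF-8  # generate the locale the test harness expects
      sudo update-locale           # write it into /etc/default/locale
      locale -a                    # print available locales to verify it took effect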

@@ -14,10 +14,12 @@ parameters:
type: object
default:
- build-essential
- cmake
- libdrm-amdgpu-dev
- libdrm-dev
- libdw-dev
- libelf-dev
- libsqlite3-dev
- libva-dev
- ninja-build
- pkg-config
@@ -74,8 +76,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -1,4 +1,7 @@
parameters:
- name: componentName
type: string
default: rocprofiler
- name: checkoutRepo
type: string
default: 'self'
@@ -15,7 +18,6 @@ parameters:
type: object
default:
- cmake
- libgtest-dev
- libdrm-dev
- libdw-dev
- libsystemd-dev
@@ -26,13 +28,13 @@ parameters:
- name: pipModules
type: object
default:
- pyyaml==5.3.1
- Cppheaderparser
- websockets
- matplotlib
- lxml
- barectf
- Cppheaderparser
- lxml
- matplotlib
- pandas
- pyyaml==5.3.1
- websockets
- name: rocmDependencies
type: object
default:
@@ -41,29 +43,33 @@ parameters:
- ROCdbgapi
- rocm-cmake
- rocm-core
- rocm_smi_lib
- rocminfo
- ROCR-Runtime
- rocm_smi_lib
- rocprofiler-register
- ROCR-Runtime
- roctracer
- name: jobMatrix
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocprofiler_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -72,6 +78,10 @@ jobs:
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
@@ -79,46 +89,59 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_MODULE_PATH=$(Build.SourcesDirectory)/cmake_modules;$(Agent.BuildDirectory)/rocm/lib/cmake;$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_MODULE_PATH=$(Build.SourcesDirectory)/cmake_modules;$(Agent.BuildDirectory)/rocm/lib/cmake;$(Agent.BuildDirectory)/rocm/lib/cmake/hip;$(Agent.BuildDirectory)/rocm/lib64/cmake;$(Agent.BuildDirectory)/rocm/lib64/cmake/hip
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DENABLE_LDCONFIG=OFF
-DUSE_PROF_API=1
-DGPU_TARGETS=${{ job.target }}
-DAMDGPU_TARGETS=${{ job.target }}
multithreadFlag: -- -j32
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ROCM_PATH:::/home/user/workspace/rocm
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ROCM_PATH:::/home/user/workspace/rocm
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocprofiler_test_${{ job.target }}
dependsOn: rocprofiler_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -139,16 +162,21 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
@@ -157,12 +185,14 @@ jobs:
testExecutable: ./run.sh
testParameters: ''
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocprofilerV2
testDir: $(Agent.BuildDirectory)/rocm
testExecutable: share/rocprofiler/tests/runUnitTests
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File
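
rocprofiler now sources GoogleTest from the dependencies-vendor step rather than the libgtest-dev apt package, and the CMake flags point at the vendor install tree. A sketch of the pairing, assuming the vendor step installs into $(Agent.BuildDirectory)/vendor as the diff's CMAKE_PREFIX_PATH implies:

- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
  parameters:
    dependencyList:
    - gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
  parameters:
    extraBuildFlags: >-
      -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"

Adding the lib64 cmake directories to CMAKE_MODULE_PATH serves the almalinux8 jobs, where packages install under lib64 instead of lib.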

@@ -1,4 +1,7 @@
parameters:
- name: componentName
type: string
default: roctracer
- name: checkoutRepo
type: string
default: 'self'
@@ -47,10 +50,12 @@ parameters:
buildJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
@@ -59,18 +64,16 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: roctracer_build_${{ job.os }}_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: 'ubuntu-22.04'
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
workspace:
clean: all
steps:
@@ -121,8 +124,8 @@ jobs:
# registerROCmPackages: true
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: roctracer_test_${{ job.os }}_${{ job.target }}
dependsOn: roctracer_build_${{ job.os }}_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),

View File

@@ -4,71 +4,71 @@ parameters:
- name: aptPackages
type: object
default:
- build-essential
- git
- ninja-build
- openjdk-8-jdk
- ca-certificates
- autoconf
- bc
- bridge-utils
- build-essential
- ca-certificates
- ccache
- devscripts
- dkms
- doxygen
- fakeroot
- ffmpeg
- gfortran
- git
- gnutls-bin
- libamd2
- libavformat-dev
- libblas3
- libcamd2
- libccolamd2
- libcholmod3
- libcolamd2
- libdpkg-dev
- libdpkg-perl
- libdrm-amdgpu1
- libdrm-dev
- libelf-dev
- libfreetype-dev
- libgfortran5
- libgomp1
- libjpeg-dev
- libjpeg-turbo-official
- liblapack-dev
- liblapack3
- libmetis5
- libncurses-dev
- libnuma-dev
- libopenblas-dev
- libpth-dev
- libquadmath0
- libssh-dev
- libstdc++-12-dev
- libsuitesparseconfig5
- libswscale-dev
- libtinfo-dev
- libunwind-dev
- libwebp-dev
- llvm-dev
- ncurses-base
- ninja-build
- numactl
- openjdk-8-jdk
- python-is-python3
- python3-dev
- python3-pip
- python3-venv
- wget
- ncurses-base
- libncurses-dev
- numactl
- libnuma-dev
- libssh-dev
- libunwind-dev
- llvm-dev
- libpth-dev
- qemu-kvm
- re2c
- subversion
- fakeroot
- autoconf
- libgomp1
- libtinfo-dev
- libcholmod3
- libsuitesparseconfig5
- libstdc++-12-dev
- python-is-python3
- gfortran
- libgfortran5
- liblapack3
- libblas3
- libquadmath0
- libmetis5
- libamd2
- libcamd2
- libcolamd2
- libccolamd2
- libdrm-amdgpu1
- ccache
- wget
- zip
- libjpeg-turbo-official
- libjpeg-dev
- libwebp-dev
- libfreetype-dev
- gnutls-bin
- ffmpeg
- libopenblas-dev
- liblapack-dev
- libswscale-dev
- libavformat-dev
- name: pipModules
type: object
default:
- cmake
- astunparse
- "expecttest>=0.2.1"
- "expecttest>=0.3.0"
- hypothesis
- numpy
- psutil
@@ -76,8 +76,8 @@ parameters:
- requests
- setuptools==75.8.0
- types-dataclasses
- "typing-extensions>=4.8.0"
- "sympy>=1.13.0"
- "typing-extensions>=4.10.0"
- "sympy>=1.13.3"
- filelock
- networkx
- jinja2
@@ -97,36 +97,39 @@ parameters:
- name: rocmDependencies
type: object
default:
- rocminfo
- MIOpen
- clr
- hipBLAS
- hipBLASLt
- hipFFT
- hipRAND
- hipSOLVER
- hipSPARSE
- ROCR-Runtime
- hipSPARSELt
- llvm-project
- MIOpen
- rccl
- rocBLAS
- rocFFT
- rocm-core
- rocminfo
- rocm_smi_lib
- rocPRIM
- rocprofiler-register
- rocRAND
- ROCR-Runtime
- rocSOLVER
- rocSPARSE
- roctracer
- hipBLASLt
- rocprofiler-register
- rocm-core
- rocPRIM
  # below are additional dependencies not called out by the build script, but omitting them causes errors during cmake
- composable_kernel
- hipBLAS-common
- hipCUB
- rocThrust
- hipBLAS-common
- composable_kernel
- name: rocmTestDependencies
type: object
default:
  # rocroller.so is needed but is not included in the wheel
- hipBLASLt
- rocminfo
  # Reference on what tests to run for torchvision is found in this private repo:
# https://github.com/ROCm/rocAutomation/blob/jenkins-pipelines/pytorch/pytorch_ci/test_pytorch_test1.sh#L54
@@ -240,12 +243,6 @@ jobs:
git clone https://github.com/pytorch/builder.git --depth=1 --recurse-submodules
sudo ln -s $(Build.SourcesDirectory)/builder /builder
workingDirectory: $(Build.SourcesDirectory)
- task: Bash@3
displayName: Temporarily Patch CK Submodule
inputs:
targetType: inline
script: git pull origin develop
workingDirectory: $(Build.SourcesDirectory)/pytorch/third_party/composable_kernel
- task: Bash@3
displayName: Install patchelf
inputs:
@@ -267,6 +264,11 @@ jobs:
script: |
sudo bash pytorch/.ci/docker/common/install_rocm_magma.sh $(MAGMA_ROCM)
workingDirectory: $(Build.SourcesDirectory)
- task: Bash@3
displayName: Install targeted typing_extensions for build
inputs:
targetType: inline
script: pip install --target=$(Build.SourcesDirectory)/pytorch/torch/.. typing_extensions
- task: Bash@3
displayName: Run ROCm Build Script
inputs:
@@ -281,7 +283,6 @@ jobs:
PYTORCH_ROOT=$(PYTORCH_ROOT)
CMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
DESIRED_DEVTOOLSET=$(DESIRED_DEVTOOLSET)
TORCH_PACKAGE_NAME=torch.$(ROCM_BRANCH).$(JOB_GPU_TARGET)
PYTORCH_BUILD_VERSION=$(cat $(Build.SourcesDirectory)/pytorch/version.txt | cut -da -f1)
PYTORCH_BUILD_NUMBER=$(date -u +%Y%m%d)
SKIP_ALL_TESTS=1
@@ -322,8 +323,6 @@ jobs:
inputs:
targetType: inline
script: >-
TORCH_PACKAGE_NAME=torch.$(ROCM_BRANCH).$(JOB_GPU_TARGET)
TORCHVISION_PACKAGE_NAME=torchvision.$(ROCM_BRANCH).$(JOB_GPU_TARGET)
PYTORCH_VERSION=$(cat $(Build.SourcesDirectory)/pytorch/version.txt | cut -da -f1)post$(date -u +%Y%m%d)
BUILD_VERSION=$(cat $(Build.SourcesDirectory)/vision/version.txt | cut -da -f1)post$(date -u +%Y%m%d)
python3 setup.py bdist_wheel
@@ -400,17 +399,14 @@ jobs:
- task: DownloadPipelineArtifact@2
displayName: 'Download Pipeline Wheel Files'
inputs:
itemPattern: '**/*$(JOB_GPU_TARGET)*.whl'
itemPattern: '**/*.whl'
targetPath: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
dependencySource: staging
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: $(JOB_GPU_TARGET)
dependencySource: staging
skipLlvmSymlink: true
# get sources to run test scripts
- task: Bash@3
displayName: git clone upstream pytorch

View File
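
The PyTorch hunk drops the custom TORCH_PACKAGE_NAME variables and keeps the build version derived from version.txt plus a UTC date stamp. A sketch of what the shell substitutions evaluate to, assuming version.txt holds a value like 2.4.0a0 (hypothetical):

- task: Bash@3
  displayName: Show derived wheel version (illustrative)
  inputs:
    targetType: inline
    script: |
      # cut -da -f1 splits on the letter 'a', so 2.4.0a0 becomes 2.4.0
      PYTORCH_BUILD_VERSION=$(cat pytorch/version.txt | cut -da -f1)
      PYTORCH_BUILD_NUMBER=$(date -u +%Y%m%d)  # e.g. 20250721
      echo "version=${PYTORCH_BUILD_VERSION} build=${PYTORCH_BUILD_NUMBER}"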

@@ -3,12 +3,21 @@ parameters:
- name: jobList
type: object
default:
- gfx942-staging:
target: gfx942
source: staging
- gfx90a-staging:
target: gfx90a
source: staging
- { os: ubuntu2204, target: gfx942, source: staging }
- { os: ubuntu2204, target: gfx90a, source: staging }
- { os: ubuntu2204, target: gfx1201, source: staging }
- { os: ubuntu2204, target: gfx1100, source: staging }
- { os: ubuntu2204, target: gfx1030, source: staging }
- { os: ubuntu2404, target: gfx942, source: staging }
- { os: ubuntu2404, target: gfx90a, source: staging }
- { os: ubuntu2404, target: gfx1201, source: staging }
- { os: ubuntu2404, target: gfx1100, source: staging }
- { os: ubuntu2404, target: gfx1030, source: staging }
- { os: almalinux8, target: gfx942, source: staging }
- { os: almalinux8, target: gfx90a, source: staging }
- { os: almalinux8, target: gfx1201, source: staging }
- { os: almalinux8, target: gfx1100, source: staging }
- { os: almalinux8, target: gfx1030, source: staging }
- name: rocmDependencies
type: object
default:
@@ -16,9 +25,9 @@ parameters:
- amdsmi
- aomp-extras
- aomp
- clr
- composable_kernel
- half
- HIP
- hip-tests
- hipBLAS
- hipBLAS-common
@@ -83,7 +92,7 @@ schedules:
jobs:
- ${{ each job in parameters.jobList }}:
- job: rocm_nightly_${{ job.target }}_${{ job.source }}
- job: rocm_nightly_${{ job.os }}_${{ job.target }}_${{ job.source }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -108,9 +117,8 @@ jobs:
parameters:
dependencySource: ${{ job.source }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
skipLibraryLinking: true
skipLlvmSymlink: true
- script: df -h
displayName: System disk space after ROCm
- script: du -sh $(Agent.BuildDirectory)/rocm

View File

@@ -28,12 +28,22 @@ resources:
endpoint: ROCm
name: ROCm/hipother
ref: ${{ parameters.checkoutRef }}
pipelines:
- pipeline: hip_pipeline
source: \experimental\HIP
trigger: true
- pipeline: hipother_pipeline
source: \experimental\hipother
trigger: true
trigger: none
pr: none
jobs:
- template: ${{ variables.CI_COMPONENT_PATH }}/HIP.yml
parameters:
checkoutRepo: release_repo
checkoutRef: ${{ parameters.checkoutRef }}
- ${{ if eq(variables['Build.Reason'], 'ResourceTrigger') }}:
- template: ${{ variables.CI_COMPONENT_PATH }}/copyHIP.yml@pipelines_repo
- ${{ if ne(variables['Build.Reason'], 'ResourceTrigger') }}:
- template: ${{ variables.CI_COMPONENT_PATH }}/HIP.yml@pipelines_repo
parameters:
checkoutRepo: release_repo
checkoutRef: ${{ parameters.checkoutRef }}

View File
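
The HIP release pipeline now declares upstream pipeline resources with trigger: true and branches on how the run started: a resource-triggered run copies prebuilt artifacts via copyHIP.yml, while manual or scheduled runs build from source. A minimal sketch of the compile-time branch:

jobs:
- ${{ if eq(variables['Build.Reason'], 'ResourceTrigger') }}:
  - template: ${{ variables.CI_COMPONENT_PATH }}/copyHIP.yml@pipelines_repo
- ${{ if ne(variables['Build.Reason'], 'ResourceTrigger') }}:
  - template: ${{ variables.CI_COMPONENT_PATH }}/HIP.yml@pipelines_repo
    parameters:
      checkoutRepo: release_repo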

@@ -12,6 +12,9 @@ parameters:
- name: fileFilter
type: string
default: ''
- name: extractAndDeleteFiles
type: boolean
default: true
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -19,46 +22,35 @@ parameters:
default: false
steps:
- task: Bash@3
displayName: Set allowPartiallySucceededBuilds
inputs:
targetType: inline
script: |
if [[ ",$ALLOWED_PARTIAL_SUCCEED_BUILDS," == *",${{ parameters.componentName }},"* ]]; then
echo "##vso[task.setvariable variable=allowPartiallySucceededBuilds;]true"
else
echo "##vso[task.setvariable variable=allowPartiallySucceededBuilds;]false"
fi
- task: DownloadPipelineArtifact@2
displayName: Download ${{ parameters.componentName }}
inputs:
${{ if eq(parameters.aggregatePipeline, false) }}:
itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
targetPath: '$(Pipeline.Workspace)/d'
allowPartiallySucceededBuilds: true
${{ if parameters.aggregatePipeline }}:
buildType: 'current'
${{ else }}:
buildType: 'specific'
project: ROCm-CI
definition: ${{ parameters.pipelineId }}
specificBuildWithTriggering: true
itemPattern: '**/*${{ parameters.fileFilter }}*'
# aomp is a special case, since the trigger file is under ROCm/ROCm instead of the component repo
${{ if notIn(parameters.componentName, 'aomp') }}:
buildVersionToDownload: latestFromBranch # default is 'latest'
definition: ${{ parameters.pipelineId }}
branchName: refs/heads/${{ parameters.branchName }}
allowPartiallySucceededBuilds: $(allowPartiallySucceededBuilds)
targetPath: '$(Pipeline.Workspace)/d'
${{ else }}:
buildType: 'current'
itemPattern: '**/${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
allowPartiallySucceededBuilds: $(allowPartiallySucceededBuilds)
targetPath: '$(Pipeline.Workspace)/d'
- task: ExtractFiles@1
displayName: Extract ${{ parameters.componentName }}
inputs:
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
destinationFolder: '$(Agent.BuildDirectory)/rocm'
cleanDestinationFolder: false
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: Cleanup Compressed ${{ parameters.componentName }}
inputs:
SourceFolder: '$(Pipeline.Workspace)/d'
Contents: '**/*.tar.gz'
RemoveDotFiles: true
${{ if eq(parameters.componentName, 'aomp') }}:
      buildVersionToDownload: latest # the aomp trigger lives in ROCm/ROCm, so we cannot use ROCm/aomp branch names
${{ else }}:
buildVersionToDownload: latestFromBranch
- ${{ if eq(parameters.extractAndDeleteFiles, true) }}:
- task: ExtractFiles@1
displayName: Extract ${{ parameters.componentName }}
inputs:
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
destinationFolder: '$(Agent.BuildDirectory)/rocm'
cleanDestinationFolder: false
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: Clean up Compressed ${{ parameters.componentName }}
inputs:
SourceFolder: '$(Pipeline.Workspace)/d'
Contents: '**/*.tar.gz'
RemoveDotFiles: true

View File
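
Two mechanics in the artifact-download rewrite deserve a note: the Bash step publishes allowPartiallySucceededBuilds through the ##vso logging command so the later DownloadPipelineArtifact input can read it as $(allowPartiallySucceededBuilds), and the new extractAndDeleteFiles flag lets callers defer extraction. A sketch, with the component name hard-coded for illustration:

- task: Bash@3
  inputs:
    targetType: inline
    script: |
      # the surrounding commas make this a whole-entry match, not a substring match
      if [[ ",$ALLOWED_PARTIAL_SUCCEED_BUILDS," == *",rocBLAS,"* ]]; then
        echo "##vso[task.setvariable variable=allowPartiallySucceededBuilds;]true"
      else
        echo "##vso[task.setvariable variable=allowPartiallySucceededBuilds;]false"
      fi
- ${{ if eq(parameters.extractAndDeleteFiles, true) }}:
  - task: ExtractFiles@1
    inputs:
      archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
      destinationFolder: '$(Agent.BuildDirectory)/rocm'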

@@ -15,8 +15,8 @@ steps:
URL_BEGIN="https://artprodcus3.artifacts.visualstudio.com/"
URL_MIDDLE="/_apis/artifact/"
URL_END="/content?format=file&subPath=%2F"
FORMATTED_JOB_NAME=$(echo $(Agent.JobName) | sed 's/ /./g; s/[-_]//g')
ARTIFACT_STRING="pipelineartifact://ROCm-CI/projectId/$(DOWNLOAD_PROJECT_ID)/buildId/$(Build.BuildId)/artifactName/${FORMATTED_JOB_NAME}"
ARTIFACT_NAME="$(Agent.JobName)_$(System.JobAttempt)"
ARTIFACT_STRING="pipelineartifact://ROCm-CI/projectId/$(DOWNLOAD_PROJECT_ID)/buildId/$(Build.BuildId)/artifactName/${ARTIFACT_NAME}"
ENCODED_STRING=$(echo -n "${ARTIFACT_STRING}" | base64 -w 0)
PADDING_COUNT=$(echo -n "${ENCODED_STRING}" | awk -F= '{print NF-1}')
if [ "$PADDING_COUNT" -gt 0 ]; then

View File
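
artifact-links now base64-encodes the literal '$(Agent.JobName)_$(System.JobAttempt)' artifact name instead of a sed-normalized job name. A standalone sketch of the encoding step; the URI contents are illustrative:

- task: Bash@3
  inputs:
    targetType: inline
    script: |
      ARTIFACT_STRING="pipelineartifact://ROCm-CI/projectId/123/buildId/456/artifactName/myjob_1"
      # -n avoids encoding a trailing newline; -w 0 disables line wrapping
      ENCODED_STRING=$(echo -n "${ARTIFACT_STRING}" | base64 -w 0)
      # count the '=' padding characters (awk splits on '=' and counts fields)
      PADDING_COUNT=$(echo -n "${ENCODED_STRING}" | awk -F= '{print NF-1}')
      echo "encoded=${ENCODED_STRING} padding=${PADDING_COUNT}"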

@@ -26,7 +26,7 @@ steps:
includeRootFolder: false
archiveType: 'tar'
tarCompression: 'gz'
archiveFile: '$(Build.ArtifactStagingDirectory)/${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz'
archiveFile: '$(Build.ArtifactStagingDirectory)/${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz'
- task: DeleteFiles@1
displayName: 'Cleanup Staging Area'
inputs:
@@ -38,7 +38,7 @@ steps:
inputs:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: echo "${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz" >> pipelineArtifacts.txt
script: echo "${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz" >> pipelineArtifacts.txt
# then publish it
- ${{ if parameters.publish }}:
- task: PublishPipelineArtifact@1
@@ -46,4 +46,6 @@ steps:
displayName: '${{ parameters.artifactName }} Publish'
retryCountOnTaskFailure: 3
inputs:
# if this artifact name is changed, please also update $ARTIFACT_URL inside miopen-get-ck-build.yml
artifactName: $(Agent.JobName)_$(System.JobAttempt)
targetPath: '$(Build.ArtifactStagingDirectory)'

View File
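
Suffixing both the tarball and the published artifact with $(System.JobAttempt) keeps a rerun of a failed job from colliding with the artifact its first attempt already published. A sketch of the naming with placeholder component and target values:

- task: ArchiveFiles@2
  inputs:
    rootFolderOrFile: '$(Build.BinariesDirectory)'
    includeRootFolder: false
    archiveType: 'tar'
    tarCompression: 'gz'
    # attempt 2 of a rerun produces ..._binaries_2.tar.gz instead of overwriting _1
    archiveFile: '$(Build.ArtifactStagingDirectory)/rocBLAS_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_gfx942_binaries_$(System.JobAttempt).tar.gz'
- task: PublishPipelineArtifact@1
  inputs:
    artifactName: $(Agent.JobName)_$(System.JobAttempt)
    targetPath: '$(Build.ArtifactStagingDirectory)'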

@@ -1,4 +1,7 @@
parameters:
- name: os
type: string
default: 'ubuntu2204'
- name: componentName
type: string
default: ''
@@ -20,17 +23,23 @@ steps:
displayName: '${{ parameters.componentName }} configure flags'
inputs:
targetType: inline
script: ./configure --prefix=${{ parameters.installDir }} ${{ parameters.configureFlags }}
workingDirectory: ${{ parameters.buildDir }}
script: |
${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
./configure --prefix=${{ parameters.installDir }} ${{ parameters.configureFlags }}
- task: Bash@3
displayName: '${{ parameters.componentName }} make'
inputs:
targetType: inline
script: ${{ parameters.makeCallPrefix }} make -j$(nproc)
workingDirectory: ${{ parameters.buildDir }}
script: |
${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
${{ parameters.makeCallPrefix }} make -j$(nproc)
- task: Bash@3
displayName: '${{ parameters.componentName }} make install'
inputs:
targetType: inline
script: make install
workingDirectory: ${{ parameters.buildDir }}
script: |
${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
make install

View File
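
The configure/make/install steps now splice a gcc-toolset enable line into each script at template-expansion time with iif(); on ubuntu2204 the expression expands to an empty string, so the rendered script is unchanged. A sketch:

- task: Bash@3
  inputs:
    targetType: inline
    script: |
      ${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
      make -j$(nproc)  # uses gcc-toolset-14 on almalinux8, the default toolchain elsewhere

Since iif() is evaluated before the job runs, no runtime branching is needed, but the source line has to be repeated in every script that invokes the compiler.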

@@ -1,10 +1,15 @@
parameters:
- name: os
type: string
default: ubuntu2204
- name: repositoryUrl
type: string
default: https://download.amd.com/developer/eula/aocl/aocl-4-2
- name: packageName
type: string
default: aocl-linux-gcc-4.2.0_1_amd64.deb
type: object
default:
ubuntu2204: aocl-linux-gcc-4.2.0_1_amd64.deb
almalinux8: aocl-linux-gcc-4.2.0-1.x86_64.rpm
steps:
- task: Bash@3
@@ -12,16 +17,19 @@ steps:
inputs:
targetType: inline
workingDirectory: $(Pipeline.Workspace)
script: wget -nv ${{ parameters.repositoryUrl }}/${{ parameters.packageName }}
script: wget -nv ${{ parameters.repositoryUrl }}/${{ parameters.packageName[parameters.os] }}
- task: Bash@3
displayName: Install AOCL
inputs:
targetType: inline
workingDirectory: $(Pipeline.Workspace)
script: sudo apt install -y ./${{ parameters.packageName }}
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: sudo apt install -y ./${{ parameters.packageName[parameters.os] }}
${{ elseif eq(parameters.os, 'almalinux8') }}:
script: sudo dnf install -y ./${{ parameters.packageName[parameters.os] }}
- task: Bash@3
displayName: Clean up AOCL
inputs:
targetType: inline
workingDirectory: $(Pipeline.Workspace)
script: rm -f ${{ parameters.packageName }}
script: rm -f ${{ parameters.packageName[parameters.os] }}

View File
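
packageName is now an object parameter used as an os-indexed map, so a single template serves both the .deb and .rpm flavors of AOCL. The lookup syntax, sketched in isolation:

parameters:
- name: os
  type: string
  default: ubuntu2204
- name: packageName
  type: object
  default:
    ubuntu2204: aocl-linux-gcc-4.2.0_1_amd64.deb
    almalinux8: aocl-linux-gcc-4.2.0-1.x86_64.rpm
steps:
- script: echo ${{ parameters.packageName[parameters.os] }}  # prints the .deb name by default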

@@ -1,25 +1,44 @@
parameters:
- name: os
type: string
default: ubuntu2204
steps:
- task: Bash@3
displayName: Get aqlprofile package name
inputs:
targetType: inline
script: |
export packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb")
echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: |
export packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb")
echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
${{ if eq(parameters.os, 'almalinux8') }}:
script: |
export packageName=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1)
echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
- task: Bash@3
displayName: 'Download aqlprofile'
inputs:
targetType: inline
script: wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$(packageName)
workingDirectory: '$(Pipeline.Workspace)'
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$(packageName)
${{ if eq(parameters.os, 'almalinux8') }}:
script: wget -nv https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/$(packageName)
- task: Bash@3
displayName: 'Extract aqlprofile'
inputs:
targetType: inline
script: |
mkdir hsa-amd-aqlprofile
dpkg-deb -R $(packageName) hsa-amd-aqlprofile
workingDirectory: '$(Pipeline.Workspace)'
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: |
mkdir hsa-amd-aqlprofile
dpkg-deb -R $(packageName) hsa-amd-aqlprofile
${{ if eq(parameters.os, 'almalinux8') }}:
script: |
mkdir hsa-amd-aqlprofile
sudo dnf -y install rpm-build cpio
rpm2cpio $(packageName) | (cd hsa-amd-aqlprofile && cpio -idmv)
- task: Bash@3
displayName: 'Copy aqlprofile files'
inputs:

View File
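
The aqlprofile step now carries parallel download and extraction paths per OS. The two extraction idioms differ: dpkg-deb unpacks a .deb directly, while an .rpm must be converted to a cpio stream first. A sketch with a placeholder package file name (the real name is resolved at runtime from repo.radeon.com):

- task: Bash@3
  inputs:
    targetType: inline
    script: |
      mkdir hsa-amd-aqlprofile
      # Ubuntu: -R extracts the payload and control files into the directory
      dpkg-deb -R hsa-amd-aqlprofile.deb hsa-amd-aqlprofile
      # AlmaLinux equivalent: rpm2cpio emits a cpio archive; -idmv extracts, keeping dirs and modes
      # rpm2cpio hsa-amd-aqlprofile.rpm | (cd hsa-amd-aqlprofile && cpio -idmv)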

@@ -1,10 +1,23 @@
# replace cmake from apt install with a newer pinned version from the Kitware release tarball
steps:
- task: Bash@3
displayName: update cmake
displayName: Install CMake 3.31
inputs:
targetType: inline
script: |
sudo apt purge cmake -y
sudo snap install cmake --classic --channel=3.31/stable
hash -r
CMAKE_VERSION=3.31.0
CMAKE_ROOT="$(Pipeline.Workspace)/cmake"
echo "Downloading CMake $CMAKE_VERSION..."
curl -fsSL -o cmake.tar.gz https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz
echo "Extracting to $CMAKE_ROOT..."
sudo mkdir -p $CMAKE_ROOT
sudo tar --strip-components=1 -xz -C $CMAKE_ROOT -f cmake.tar.gz
echo "##vso[task.prependpath]$CMAKE_ROOT/bin"
- task: Bash@3
displayName: cmake --version
inputs:
targetType: inline
script: |
cmake --version

View File
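
The snap-based CMake install is replaced by a pinned Kitware release tarball that is put on PATH through a logging command. The key mechanic is ##vso[task.prependpath], which affects every subsequent task in the job; a shortened sketch:

- task: Bash@3
  inputs:
    targetType: inline
    script: |
      CMAKE_ROOT="$(Pipeline.Workspace)/cmake"
      sudo mkdir -p $CMAKE_ROOT
      # assumes cmake.tar.gz was downloaded beforehand, as in the full step above
      sudo tar --strip-components=1 -xz -C $CMAKE_ROOT -f cmake.tar.gz
      echo "##vso[task.prependpath]$CMAKE_ROOT/bin"  # later tasks resolve this cmake first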

@@ -28,8 +28,9 @@ parameters:
bison: bison
ccache: ccache
cmake: cmake
cuda-toolkit: cuda-toolkit
cudnn: libcudnn9-cuda-11
cuda-toolkit-12-9: cuda-compiler-12-9 cuda-toolkit-12-9
libcudnn9-dev-cuda-12: libcudnn9-cuda-12
dejagnu: dejagnu
doxygen: doxygen
# note: doxygen-doc is not available in dnf
# libavcodec-dev, libavformat-dev, libavutil-dev come with ffmpeg-devel
@@ -42,15 +43,20 @@ parameters:
graphviz: graphviz
libbabeltrace-dev: libbabeltrace-devel
libbison-dev: bison-devel
libboost-program-options-dev: boost-devel
# note: libdrm-amdgpu1 is not available in dnf
libdrm-dev: libdrm-devel
libdrm-amdgpu-dev: libdrm-amdgpu-devel
libdw-dev: libdw-devel
libelf-dev: elfutils-libelf-devel elfutils-devel
libffi-dev: libffi-devellibtool
libdw-dev: elfutils-devel
libelf-dev: elfutils-libelf-devel
libexpat-dev: expat-devel
libffi-dev: libffi-devel
libfftw3-dev: fftw-devel
libfmt-dev: fmt-devel
libgmp-dev: gmp-devel
liblzma-dev: xz-devel
libmpfr-dev: mpfr-devel
libmsgpack-dev: msgpack-devel
libncurses5-dev: ncurses-devel
libnuma-dev: numactl-devel
libopenmpi-dev: openmpi-devel
@@ -63,6 +69,7 @@ parameters:
libva-amdgpu-dev: libva-amdgpu-devel
mesa-amdgpu-va-drivers: mesa-amdgpu-va-drivers
mesa-common-dev: mesa-libGL-devel
ncurses-dev: ncurses-devel
    # note: llvm needs a ninja-build version newer than what dnf provides
ocl-icd-libopencl1: ocl-icd
ocl-icd-opencl-dev: ocl-icd-devel

View File

@@ -19,16 +19,6 @@ parameters:
- name: gpuTarget
type: string
default: ''
# set to true if you're calling this template file multiple times in the same pipeline
# only leave the last call false to optimize the sequence
- name: skipLibraryLinking
type: boolean
default: false
# set to true if llvm-project is not downloaded in a particular call
# or if you just don't want the symlink
- name: skipLlvmSymlink
type: boolean
default: false
# set to true if dlopen calls for HIP libraries are causing failures
# because they do not follow shared library symlink convention
- name: setupHIPLibrarySymlinks
@@ -130,7 +120,7 @@ parameters:
hipRAND:
pipelineId: $(HIPRAND_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: mainline
mainlineBranch: develop
hasGpuTarget: true
hipSOLVER:
pipelineId: $(HIPSOLVER_PIPELINE_ID)
@@ -305,7 +295,7 @@ parameters:
rocRAND:
pipelineId: $(ROCRAND_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: mainline
mainlineBranch: develop
hasGpuTarget: true
rocr_debug_agent:
pipelineId: $(ROCR_DEBUG_AGENT_PIPELINE_ID)
@@ -367,6 +357,7 @@ steps:
componentName: ${{ split(dependency, ':')[0] }}
pipelineId: ${{ parameters.componentVarList[split(dependency, ':')[0]].pipelineId }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
extractAndDeleteFiles: false
${{ if parameters.componentVarList[split(dependency, ':')[0]].hasGpuTarget }}:
fileFilter: "${{ split(dependency, ':')[1] }}*_${{ parameters.os }}_${{ parameters.gpuTarget }}"
# dependencySource = staging
@@ -397,6 +388,7 @@ steps:
${{ if parameters.componentVarList[dependency].hasGpuTarget }}:
gpuTarget: ${{ parameters.gpuTarget }}
preTargetFilter: ${{ dependency }}
os: ${{ parameters.os }}
buildType: current
- ${{ else }}:
- template: artifact-download.yml
@@ -404,6 +396,7 @@ steps:
componentName: ${{ dependency }}
pipelineId: ${{ parameters.componentVarList[dependency].pipelineId }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
extractAndDeleteFiles: false
${{ if parameters.componentVarList[dependency].hasGpuTarget }}:
fileFilter: ${{ parameters.os }}_${{ parameters.gpuTarget }}
${{ else }}:
@@ -429,23 +422,43 @@ steps:
# default = staging
${{ else }}:
branchName: ${{ parameters.componentVarList[dependency].stagingBranch }}
# Set link to redirect llvm folder
- ${{ if eq(parameters.skipLlvmSymlink, false) }}:
- task: ExtractFiles@1
displayName: Extract ROCm artifacts
inputs:
archiveFilePatterns: $(Pipeline.Workspace)/d/**/*.tar.gz
destinationFolder: $(Agent.BuildDirectory)/rocm
cleanDestinationFolder: false
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: Clean up ROCm artifacts
inputs:
SourceFolder: $(Pipeline.Workspace)/d
Contents: '**/*.tar.gz'
RemoveDotFiles: true
- ${{ if containsValue(parameters.dependencyList, 'llvm-project') }}:
- task: Bash@3
displayName: Symlink from rocm/llvm to rocm/lib/llvm
inputs:
targetType: inline
script: |
sudo mkdir -p $(Agent.BuildDirectory)/rocm/lib
sudo ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
echo "Created symlink from rocm/llvm to rocm/lib/llvm"
- task: Bash@3
displayName: Symlink executables from rocm/llvm/bin to rocm/bin
inputs:
targetType: inline
script: |
for file in amdclang amdclang++ amdclang-cl amdclang-cpp amdflang amdlld aompcc mygpu mycpu offload-arch; do
sudo ln -s $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
echo "Created symlink from rocm/llvm/bin/$file to rocm/bin/$file"
done
- ${{ if containsValue(parameters.dependencyList, 'rocm-core') }}:
- task: Bash@3
displayName: Print rocm/.info/version
inputs:
targetType: inline
script: cat $(Agent.BuildDirectory)/rocm/.info/version
# dlopen calls within a ctest or pytest sequence run into issues when the shared-library symlink convention is not followed
# the convention is as follows:
# unversioned .so is a symlink to major version .so
@@ -482,17 +495,16 @@ steps:
inputs:
targetType: inline
script: ls -la1R $(Agent.BuildDirectory)/rocm
- ${{ if eq(parameters.skipLibraryLinking, false) }}:
- task: Bash@3
displayName: 'Link ROCm shared libraries'
inputs:
targetType: inline
# the OS ignores duplicates if the ROCm lib folder shows up more than once
script: |
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
sudo cat /etc/ld.so.conf.d/rocm-ci.conf
sudo ldconfig -v
ldconfig -p
- task: Bash@3
displayName: 'Link ROCm shared libraries'
inputs:
targetType: inline
# the OS ignores duplicates if the ROCm lib folder shows up more than once
script: |
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
sudo cat /etc/ld.so.conf.d/rocm-ci.conf
sudo ldconfig -v
ldconfig -p

View File

@@ -23,13 +23,14 @@ steps:
inputs:
targetType: inline
script: |
sudo apt-get install -y jq
${{ iif(or(eq(parameters.os, 'ubuntu2204'), eq(parameters.os, 'ubuntu2404')), 'sudo apt-get install -y jq', '') }}
# RESOURCES_REPOSITORIES is a runtime variable (not an env var!) that contains quotation marks and newlines
# So we need to save it to a file to properly preserve its formatting and contents
cat <<EOF > resources.repositories
$(RESOURCES_REPOSITORIES)
EOF
echo "Value of resources.repositories:"
cat resources.repositories
IS_TAG_BUILD=$(jq 'has("release_repo")' resources.repositories)
@@ -66,8 +67,6 @@ steps:
)
' resources.repositories)
manifest_json=$(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json
dependencies=()
for manifest_file in $(Pipeline.Workspace)/d/**/manifest_*.json; do
echo "Processing $manifest_file"
@@ -78,6 +77,10 @@ steps:
done
dependencies_json=$(printf '%s\n' "${dependencies[@]}" | jq -s '.')
manifest_filename="manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}"
echo "##vso[task.setvariable variable=manifest_filename]$manifest_filename"
manifest_json=$(Build.ArtifactStagingDirectory)/$manifest_filename.json
jq -n \
--argjson current "$current" \
--argjson dependencies "$dependencies_json" \
@@ -111,8 +114,14 @@ steps:
')
dependencies_rows=$(echo $dependencies_rows)
echo "##vso[task.setvariable variable=dependencies_rows;]$dependencies_rows"
cat $manifest_json
- task: Bash@3
displayName: Print manifest.json
condition: always()
continueOnError: true
inputs:
targetType: inline
script: |
cat $(Build.ArtifactStagingDirectory)/$(manifest_filename).json
- task: Bash@3
displayName: Create manifest.html
condition: always()
@@ -120,10 +129,10 @@ steps:
inputs:
targetType: inline
script: |
manifest_html=$(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
manifest_html="$(Build.ArtifactStagingDirectory)/$(manifest_filename).html"
cat <<EOF > $manifest_html
<html>
<h1>Manifest</h1>
<h1>$(manifest_filename)</h1>
<h2>Current</h2>
<table border="1">
<tr>
@@ -163,7 +172,7 @@ steps:
continueOnError: true
inputs:
tabName: Manifest
reportDir: $(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
reportDir: $(Build.ArtifactStagingDirectory)/$(manifest_filename).html
- task: Bash@3
displayName: Save manifest artifact file name
condition: always()
@@ -172,5 +181,5 @@ steps:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: |
echo "manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html" >> pipelineArtifacts.txt
echo "manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json" >> pipelineArtifacts.txt
echo "$(manifest_filename).html" >> pipelineArtifacts.txt
echo "$(manifest_filename).json" >> pipelineArtifacts.txt

View File

@@ -17,7 +17,6 @@ steps:
script: |
AZ_API="https://dev.azure.com/ROCm-CI/ROCm-CI/_apis"
GH_API="https://api.github.com/repos/ROCm"
ARTIFACT_NAME="composablekernelbuild${{ parameters.gpuTarget }}"
EXIT_CODE=0
# Try to find an Azure build for the specific CK commit called out in MIOpen's requirements.txt
@@ -39,8 +38,15 @@ steps:
echo "Found specific CK build ID: $CK_BUILD_ID"
fi
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?artifactName=$ARTIFACT_NAME&api-version=7.1"
ARTIFACT_URL=$(curl -s $AZURE_URL | jq '.resource.downloadUrl' | tr -d '"')
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?api-version=7.1"
ARTIFACT_URL=$(curl -s $AZURE_URL | \
jq --arg gfx "${{ parameters.gpuTarget }}" '
.value
| map(select(.name | test($gfx)))
| max_by(.name | capture("_(?<dropNumber>\\d+)").dropNumber | tonumber)
| .resource.downloadUrl
' | \
tr -d '"')
# If using the specific CK commit and it doesn't have any valid artifacts, use latest successful CK build instead
if { [[ -z "$ARTIFACT_URL" ]] || [[ "$ARTIFACT_URL" == "null" ]]; } && [[ $EXIT_CODE -eq 0 ]]; then
@@ -48,8 +54,15 @@ steps:
LATEST_BUILD_URL="$AZ_API/build/builds?definitions=$(COMPOSABLE_KERNEL_PIPELINE_ID)&statusFilter=completed&resultFilter=succeeded&\$top=1&api-version=7.1"
CK_BUILD_ID=$(curl -s $LATEST_BUILD_URL | jq '.value[0].id')
echo "Found latest CK build ID: $CK_BUILD_ID"
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?artifactName=$ARTIFACT_NAME&api-version=7.1"
ARTIFACT_URL=$(curl -s $AZURE_URL | jq '.resource.downloadUrl' | tr -d '"')
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?api-version=7.1"
ARTIFACT_URL=$(curl -s $AZURE_URL | \
jq --arg os "ubuntu2204" --arg gfx "${{ parameters.gpuTarget }}" '
.value
| map(select(.name | test($os) and test($gfx)))
| max_by(.name | capture("_(?<dropNumber>\\d+)").dropNumber | tonumber)
| .resource.downloadUrl
' | \
tr -d '"')
EXIT_CODE=2
fi
@@ -57,8 +70,8 @@ steps:
wget --tries=5 --waitretry=10 --retry-connrefused -nv $ARTIFACT_URL -O $(System.ArtifactsDirectory)/ck.zip
unzip $(System.ArtifactsDirectory)/ck.zip -d $(System.ArtifactsDirectory)
mkdir -p $(Agent.BuildDirectory)/rocm
tar -zxvf $(System.ArtifactsDirectory)/$ARTIFACT_NAME/*.tar.gz -C $(Agent.BuildDirectory)/rocm
rm -r $(System.ArtifactsDirectory)/ck.zip $(System.ArtifactsDirectory)/$ARTIFACT_NAME
tar -zxvf $(System.ArtifactsDirectory)/composable_kernel*/*.tar.gz -C $(Agent.BuildDirectory)/rocm
rm -r $(System.ArtifactsDirectory)/ck.zip $(System.ArtifactsDirectory)/composable_kernel*
if [[ $EXIT_CODE -ne 0 ]]; then
BUILD_COMMIT=$(curl -s $AZ_API/build/builds/$CK_BUILD_ID | jq '.sourceVersion' | tr -d '"')

View File

@@ -1,25 +1,28 @@
parameters:
- name: os
type: string
default: 'ubuntu2204'
- name: componentName
type: string
default: ''
- name: os
type: string
default: ubuntu2204
- name: testDir
type: string
default: 'build'
default: build
- name: testExecutable
type: string
default: 'ctest'
default: ctest
- name: testParameters
type: string
default: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml'
default: --output-on-failure --force-new-ctest-process --output-junit test_output.xml
- name: extraTestParameters
type: string
default: ''
- name: testOutputFile
type: string
default: test_output.xml
- name: testOutputFormat
type: string
default: 'JUnit'
default: JUnit
values:
- JUnit
- NUnit
@@ -29,31 +32,28 @@ parameters:
- name: testPublishResults
type: boolean
default: true
- name: allowPartiallySucceededBuilds
- name: allowComponentTestFailure
type: object
default:
- amdsmi
- aomp
- HIPIFY
- MIVisionX
- rocm_smi_lib
- rocprofiler-sdk
- roctracer
# the following do not use this template but allow test failures, included for completeness
- aomp
- ROCgdb
steps:
# run test, continue on failure to publish results
# and to publish build artifacts
- task: Bash@3
displayName: '${{ parameters.componentName }} Test'
continueOnError: ${{ containsValue(parameters.allowPartiallySucceededBuilds, parameters.componentName) }}
continueOnError: ${{ containsValue(parameters.allowComponentTestFailure, parameters.componentName) }}
inputs:
targetType: inline
${{ if ne(parameters.os, 'almalinux8') }}:
script: ${{ parameters.testExecutable }} ${{ parameters.testParameters }}
${{ else }}:
script: |
source /opt/rh/gcc-toolset-14/enable
${{ parameters.testExecutable }} ${{ parameters.testParameters }}
script: |
${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
${{ parameters.testExecutable }} ${{ parameters.testParameters }} ${{ parameters.extraTestParameters }}
workingDirectory: ${{ parameters.testDir }}
- ${{ if parameters.testPublishResults }}:
- task: PublishTestResults@2

View File

@@ -3,6 +3,8 @@
variables:
- name: RESOURCES_REPOSITORIES
value: $[ convertToJson(resources.repositories) ]
- name: CCACHE_DIR
value: $(Pipeline.Workspace)/ccache
- name: CI_ROOT_PATH
value: /.azuredevops
- name: CI_COMPONENT_PATH
@@ -30,320 +32,136 @@ variables:
- name: GFX90A_TEST_POOL
value: gfx90a_test_pool
- name: LATEST_RELEASE_VERSION
value: 6.4.0
value: 6.4.1
- name: REPO_RADEON_VERSION
value: 6.4
value: 6.4.1
- name: NEXT_RELEASE_VERSION
value: 6.5.0
value: 7.0.0
- name: LATEST_RELEASE_TAG
value: rocm-6.4.0
value: rocm-6.4.1
- name: DOCKER_SKIP_GFX
value: gfx90a
- name: AMDMIGRAPHX_GFX942_TEST_PIPELINE_ID
value: 197
- name: AMDMIGRAPHX_PIPELINE_ID
value: 113
- name: AMDMIGRAPHX_TAGGED_PIPELINE_ID
value: 60
- name: AMDSMI_PIPELINE_ID
value: 99
- name: AMDSMI_TAGGED_PIPELINE_ID
value: 33
- name: AOMP_EXTRAS_PIPELINE_ID
value: 111
- name: AOMP_EXTRAS_TAGGED_PIPELINE_ID
value: 75
- name: AOMP_PIPELINE_ID
value: 115
- name: AOMP_TAGGED_PIPELINE_ID
value: 76
- name: CCACHE_DIR
value: $(Pipeline.Workspace)/ccache
- name: CLR_PIPELINE_ID
value: 145
- name: CLR_TAGGED_PIPELINE_ID
value: 71
- name: COMPOSABLE_KERNEL_GFX942_TEST_PIPELINE_ID
value: 179
- name: COMPOSABLE_KERNEL_PIPELINE_ID
value: 86
- name: COMPOSABLE_KERNEL_TAGGED_PIPELINE_ID
value: 38
- name: FLANG_LEGACY_PIPELINE_ID
value: 77
- name: FLANG_LEGACY_TAGGED_PIPELINE_ID
value: 77
- name: HALF_PIPELINE_ID
value: 101
- name: HALF_TAGGED_PIPELINE_ID
value: 11
- name: HALF560_PIPELINE_ID
value: 68
- name: HALF560_BUILD_ID
value: 621
- name: HIP_PIPELINE_ID
value: 93
- name: HIP_TAGGED_PIPELINE_ID
value: 31
- name: HIP_TESTS_PIPELINE_ID
value: 233
- name: HIP_TESTS_TAGGED_PIPELINE_ID
value: 220
- name: HIPBLAS_COMMON_PIPELINE_ID
value: 223
- name: HIPBLAS_COMMON_TAGGED_PIPELINE_ID
value: 224
- name: HIPBLAS_GFX942_TEST_PIPELINE_ID
value: 202
value: 300
- name: HIPBLAS_PIPELINE_ID
value: 87
- name: HIPBLAS_TAGGED_PIPELINE_ID
value: 44
- name: HIPBLASLT_GFX942_TEST_PIPELINE_ID
value: 187
- name: HIPBLASLT_PIPELINE_ID
value: 112
- name: HIPBLASLT_TAGGED_PIPELINE_ID
value: 45
- name: HIPCUB_GFX942_TEST_PIPELINE_ID
value: 186
value: 301
- name: HIPCUB_PIPELINE_ID
value: 277
- name: HIPCUB_TAGGED_PIPELINE_ID
value: 46
- name: HIPFFT_GFX942_TEST_PIPELINE_ID
value: 198
- name: HIPFFT_PIPELINE_ID
value: 121
- name: HIPFFT_TAGGED_PIPELINE_ID
value: 12
- name: HIPFORT_PIPELINE_ID
value: 102
- name: HIPFORT_TAGGED_PIPELINE_ID
value: 34
- name: HIPIFY_PIPELINE_ID
value: 92
- name: HIPIFY_TAGGED_PIPELINE_ID
value: 13
- name: HIPRAND_GFX942_TEST_PIPELINE_ID
value: 188
- name: HIPRAND_PIPELINE_ID
value: 90
- name: HIPRAND_TAGGED_PIPELINE_ID
value: 42
- name: HIPSOLVER_GFX942_TEST_PIPELINE_ID
value: 201
value: 275
- name: HIPSOLVER_PIPELINE_ID
value: 84
- name: HIPSOLVER_TAGGED_PIPELINE_ID
value: 52
- name: HIPSPARSE_GFX942_TEST_PIPELINE_ID
value: 195
- name: HIPSPARSE_PIPELINE_ID
value: 83
- name: HIPSPARSE_TAGGED_PIPELINE_ID
value: 14
- name: HIPSPARSELT_GFX942_TEST_PIPELINE_ID
value: 200
- name: HIPSPARSELT_PIPELINE_ID
value: 104
- name: HIPSPARSELT_TAGGED_PIPELINE_ID
value: 53
- name: HIPTENSOR_GFX942_TEST_PIPELINE_ID
value: 192
value: 309
- name: HIPTENSOR_PIPELINE_ID
value: 105
- name: HIPTENSOR_TAGGED_PIPELINE_ID
value: 56
- name: LLVM_PROJECT_PIPELINE_ID
value: 2
- name: LLVM_PROJECT_TAGGED_PIPELINE_ID
value: 8
- name: MIOPEN_PIPELINE_ID
value: 108
- name: MIOPEN_TAGGED_PIPELINE_ID
value: 58
- name: MIVISIONX_PIPELINE_ID
value: 80
- name: MIVISIONX_TAGGED_PIPELINE_ID
value: 18
- name: OMNIPERF_PIPELINE_ID
value: 241
- name: OMNIPERF_TAGGED_PIPELINE_ID
value: 242
- name: OMNITRACE_PIPELINE_ID
value: 253
- name: OMNITRACE_TAGGED_PIPELINE_ID
value: 252
- name: RCCL_GFX942_TEST_PIPELINE_ID
value: 184
- name: RCCL_PIPELINE_ID
value: 107
- name: RCCL_TAGGED_PIPELINE_ID
value: 15
- name: RDC_PIPELINE_ID
value: 100
- name: RDC_TAGGED_PIPELINE_ID
value: 59
- name: ROCAL_PIPELINE_ID
value: 151
- name: ROCALUTION_GFX942_TEST_PIPELINE_ID
value: 196
- name: ROCALUTION_PIPELINE_ID
value: 89
- name: ROCALUTION_TAGGED_PIPELINE_ID
value: 16
- name: ROCBLAS_GFX942_TEST_PIPELINE_ID
value: 185
- name: ROCBLAS_PIPELINE_ID
value: 85
- name: ROCBLAS_TAGGED_PIPELINE_ID
value: 32
value: 302
- name: ROCDBGAPI_PIPELINE_ID
value: 135
- name: ROCDBGAPI_TAGGED_PIPELINE_ID
value: 17
- name: ROCDECODE_PIPELINE_ID
value: 79
- name: ROCDECODE_TAGGED_PIPELINE_ID
value: 21
- name: ROCFFT_GFX942_TEST_PIPELINE_ID
value: 189
- name: ROCFFT_PIPELINE_ID
value: 120
- name: ROCFFT_TAGGED_PIPELINE_ID
value: 19
- name: ROCGDB_PIPELINE_ID
value: 134
- name: ROCGDB_TAGGED_PIPELINE_ID
value: 50
- name: ROCJPEG_PIPELINE_ID
value: 262
- name: ROCJPEG_TAGGED_PIPELINE_ID
value: 263
- name: ROCM_BANDWIDTH_TEST_PIPELINE_ID
value: 88
- name: ROCM_BANDWIDTH_TEST_TAGGED_PIPELINE_ID
value: 23
- name: ROCM_CMAKE_PIPELINE_ID
value: 6
- name: ROCM_CMAKE_TAGGED_PIPELINE_ID
value: 7
- name: ROCM_CORE_PIPELINE_ID
value: 103
- name: ROCM_CORE_TAGGED_PIPELINE_ID
value: 22
- name: ROCM_EXAMPLES_GFX942_TEST_PIPELINE_ID
value: 204
- name: ROCM_EXAMPLES_PIPELINE_ID
value: 216
- name: ROCM_EXAMPLES_TAGGED_PIPELINE_ID
value: 245
- name: ROCM_SMI_LIB_PIPELINE_ID
value: 96
- name: ROCM_SMI_LIB_TAGGED_PIPELINE_ID
value: 47
- name: ROCMINFO_PIPELINE_ID
value: 91
- name: ROCMINFO_TAGGED_PIPELINE_ID
value: 27
- name: ROCMLIR_PIPELINE_ID
value: 229
- name: ROCMLIR_TAGGED_PIPELINE_ID
value: 62
- name: ROCMVALIDATIONSUITE_PIPELINE_ID
value: 106
- name: ROCMVALIDATIONSUITE_TAGGED_PIPELINE_ID
value: 43
- name: ROCPRIM_GFX942_TEST_PIPELINE_ID
value: 180
- name: ROCPRIM_PIPELINE_ID
value: 273
- name: ROCPRIM_TAGGED_PIPELINE_ID
value: 20
- name: ROCPROFILER_GFX942_TEST_PIPELINE_ID
value: 190
- name: ROCPROFILER_COMPUTE_PIPELINE_ID
value: 257
- name: ROCPROFILER_COMPUTE_TAGGED_PIPELINE_ID
value: 258
- name: ROCPROFILER_REGISTER_PIPELINE_ID
value: 1
- name: ROCPROFILER_REGISTER_TAGGED_PIPELINE_ID
value: 25
- name: ROCPROFILER_SDK_PIPELINE_ID
value: 246
- name: ROCPROFILER_SDK_TAGGED_PIPELINE_ID
value: 234
- name: ROCPROFILER_SYSTEMS_PIPELINE_ID
value: 255
- name: ROCPROFILER_SYSTEMS_TAGGED_PIPELINE_ID
value: 254
- name: ROCPROFILER_PIPELINE_ID
value: 143
- name: ROCPROFILER_TAGGED_PIPELINE_ID
value: 28
- name: ROCPYDECODE_PIPELINE_ID
value: 239
- name: ROCPYDECODE_TAGGED_PIPELINE_ID
value: 232
- name: ROCR_DEBUG_AGENT_PIPELINE_ID
value: 136
- name: ROCR_DEBUG_AGENT_TAGGED_PIPELINE_ID
value: 29
- name: ROCR_RUNTIME_PIPELINE_ID
value: 10
- name: ROCR_RUNTIME_TAGGED_PIPELINE_ID
value: 24
- name: ROCRAND_GFX942_TEST_PIPELINE_ID
value: 183
- name: ROCRAND_PIPELINE_ID
value: 95
- name: ROCRAND_TAGGED_PIPELINE_ID
value: 41
- name: ROCSOLVER_GFX942_TEST_PIPELINE_ID
value: 199
value: 274
- name: ROCSOLVER_PIPELINE_ID
value: 81
- name: ROCSOLVER_TAGGED_PIPELINE_ID
value: 55
- name: ROCSPARSE_GFX942_TEST_PIPELINE_ID
value: 191
- name: ROCSPARSE_PIPELINE_ID
value: 98
- name: ROCSPARSE_TAGGED_PIPELINE_ID
value: 67
- name: ROCT_THUNK_INTERFACE_PIPELINE_ID
value: 3
- name: ROCT_THUNK_INTERFACE_TAGGED_PIPELINE_ID
value: 9
- name: ROCTHRUST_GFX942_TEST_PIPELINE_ID
value: 194
value: 314
- name: ROCTHRUST_PIPELINE_ID
value: 276
- name: ROCTHRUST_TAGGED_PIPELINE_ID
value: 26
- name: ROCTRACER_GFX942_TEST_PIPELINE_ID
value: 181
- name: ROCTRACER_PIPELINE_ID
value: 141
- name: ROCTRACER_TAGGED_PIPELINE_ID
value: 30
- name: ROCWMMA_GFX942_TEST_PIPELINE_ID
value: 193
- name: ROCWMMA_PIPELINE_ID
value: 109
- name: ROCWMMA_TAGGED_PIPELINE_ID
value: 57
- name: RPP_GFX942_TEST_PIPELINE_ID
value: 182
- name: RPP_PIPELINE_ID
value: 78
- name: RPP_TAGGED_PIPELINE_ID
value: 39
- name: TRANSFERBENCH_PIPELINE_ID
value: 265
- name: TRANSFERBENCH_TAGGED_PIPELINE_ID
value: 266
- name: BOOST_DEPENDENCY_PIPELINE_ID
value: 250

View File

@@ -6,7 +6,7 @@ version: 2
sphinx:
configuration: docs/conf.py
formats: [htmlzip]
formats: [htmlzip, pdf]
python:
install:

View File

@@ -6,6 +6,7 @@ ACS
AccVGPR
AccVGPRs
ALU
AllReduce
AMD
AMDGPU
AMDGPUs
@@ -13,6 +14,7 @@ AMDMIGraphX
AMI
AOCC
AOMP
AOT
AOTriton
APBDIS
APIC
@@ -32,6 +34,7 @@ Andrej
Arb
Autocast
BARs
BatchNorm
BLAS
BMC
BabelStream
@@ -42,6 +45,7 @@ Bootloader
CAS
CCD
CDNA
CGUI
CHTML
CIFAR
CLI
@@ -79,10 +83,13 @@ ConnectX
CuPy
da
Dashboarding
Dataloading
DBRX
DDR
DF
DGEMM
DGL
DGLGraph
dGPU
dGPUs
DIMM
@@ -100,6 +107,7 @@ DataFrame
DataLoader
DataParallel
Debian
decompositions
DeepSeek
DeepSpeed
Dependabot
@@ -108,6 +116,7 @@ DevCap
DirectX
Dockerfile
Doxygen
dropless
ELMo
ENDPGM
EPYC
@@ -125,10 +134,12 @@ FX
Filesystem
FindDb
Flang
FlashAttention
FluxBenchmark
Fortran
Fuyu
GALB
GAT
GCC
GCD
GCDs
@@ -156,6 +167,8 @@ GPT
GPU
GPU's
GPUs
Graphbolt
GraphSage
GRBM
GenAI
GenZ
@@ -168,6 +181,7 @@ HIPCC
HIPExtension
HIPIFY
HIPification
hipification
HIPify
HPC
HPCG
@@ -182,6 +196,7 @@ Higgs
Hyperparameters
Huggingface
ICD
ICT
ICV
IDE
IDEs
@@ -216,6 +231,7 @@ KV
KVM
Karpathy's
KiB
Kineto
Keras
Khronos
LAPACK
@@ -228,6 +244,7 @@ LM
LSAN
LSan
LTS
LSTMs
LanguageCrossEntropy
LoRA
MEM
@@ -255,6 +272,7 @@ Makefiles
Matplotlib
Matrox
MaxText
Megablocks
Megatrends
Megatron
Mellanox
@@ -264,6 +282,8 @@ Miniconda
MirroredStrategy
Mixtral
MosaicML
MoEs
Mpops
Multicore
Multithreaded
MyEnvironment
@@ -277,6 +297,7 @@ NIC
NICs
NLI
NLP
NN
NPKit
NPS
NSP
@@ -313,6 +334,7 @@ OpenMPI
OpenSSL
OpenVX
OpenXLA
Optim
Oversubscription
PagedAttention
Pallas
@@ -351,6 +373,7 @@ RDC's
RDMA
RDNA
README
Recomputation
RHEL
RMW
RNN
@@ -383,11 +406,13 @@ Ryzen
SALU
SBIOS
SCA
ScaledGEMM
SDK
SDMA
SDPA
SDRAM
SENDMSG
SGLang
SGPR
SGPRs
SHA
@@ -423,6 +448,8 @@ TCI
TCIU
TCP
TCR
TensorRT
TensorFloat
TF
TFLOPS
TP
@@ -509,6 +536,7 @@ allocator
allocators
amdgpu
api
aten
atmi
atomics
autogenerated
@@ -679,6 +707,7 @@ installable
interop
interprocedural
intra
intrinsics
invariants
invocating
ipo
@@ -697,11 +726,13 @@ linearized
linter
linux
llvm
lm
localscratch
logits
lossy
macOS
matchers
megatron
microarchitecture
migraphx
migratable
@@ -773,6 +804,7 @@ quantile
quantizer
quasirandom
queueing
qwen
radeon
rccl
rdc
@@ -781,6 +813,7 @@ reStructuredText
redirections
refactorization
reformats
reinforcement
repo
repos
representativeness
@@ -788,6 +821,7 @@ req
resampling
rescaling
reusability
RLHF
roadmap
roc
rocAL
@@ -825,6 +859,7 @@ roctracer
rst
runtime
runtimes
ResNet
sL
scalability
scalable
@@ -833,6 +868,7 @@ seealso
sendmsg
seqs
serializers
sglang
shader
sharding
sigmoid
@@ -840,6 +876,7 @@ sm
smi
softmax
spack
spmm
src
stochastically
strided
@@ -848,6 +885,7 @@ subdirectory
subexpression
subfolder
subfolders
submatrix
submodule
submodules
subnet
@@ -872,6 +910,7 @@ torchvision
tqdm
tracebacks
txt
TopK
uarch
uncached
uncacheable
@@ -899,6 +938,7 @@ vectorize
vectorized
vectorizer
vectorizes
verl
virtualize
virtualized
vjxb

View File

@@ -4,9 +4,165 @@ This page is a historical overview of changes made to ROCm components. This
consolidated changelog documents key modifications and improvements across
different versions of the ROCm software stack and its components.
## ROCm 6.4.2
See the [ROCm 6.4.2 release notes](https://rocm.docs.amd.com/en/docs-6.4.2/about/release-notes.html)
for a complete overview of this release.
### **AMD SMI** (25.5.1)
#### Added
- Compute Unit Occupancy information per process.
- Support for getting the GPU Board voltage.
- New firmware PLDM_BUNDLE. `amd-smi firmware` can now show the PLDM Bundle on supported systems.
- `amd-smi ras --afid --cper-file <file_path>` to decode CPER records.
#### Changed
- Padded `asic_serial` in `amdsmi_get_asic_info` with 0s.
- Renamed field `COMPUTE_PARTITION` to `ACCELERATOR_PARTITION` in CLI call `amd-smi --partition`.
#### Resolved issues
- Corrected VRAM memory calculation in `amdsmi_get_gpu_process_list`. Previously, the VRAM memory usage reported by `amdsmi_get_gpu_process_list` was inaccurate and was calculated using KB instead of KiB.
```{note}
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
```
### **HIP** (6.4.2)
#### Added
* HIP API implementation for `hipEventRecordWithFlags`, which records an event in the specified stream with flags (see the sketch after this list).
* Support for the pointer attribute `HIP_POINTER_ATTRIBUTE_CONTEXT`.
* Support for the flags `hipEventWaitDefault` and `hipEventWaitExternal`.
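A minimal sketch of the new entry points (hedged: the changelog only names the APIs and flags, so the three-argument, CUDA-style `hipEventRecordWithFlags(event, stream, flags)` signature and the use of `0` as the default record flag are assumptions):
```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
    hipStream_t producer, consumer;
    hipEvent_t event;
    (void)hipStreamCreate(&producer);
    (void)hipStreamCreate(&consumer);
    (void)hipEventCreate(&event);

    // Record the event on the producer stream; 0 requests default behavior.
    (void)hipEventRecordWithFlags(event, producer, 0);

    // Make the consumer stream wait on the event using the new wait flag.
    (void)hipStreamWaitEvent(consumer, event, hipEventWaitDefault);

    (void)hipEventSynchronize(event);
    printf("event recorded and awaited\n");

    (void)hipEventDestroy(event);
    (void)hipStreamDestroy(producer);
    (void)hipStreamDestroy(consumer);
    return 0;
}
```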
#### Optimized
* Improved implementation of `hipEventSynchronize`: the HIP runtime now issues internal callbacks as non-blocking operations to improve performance.
#### Resolved issues
* Issue of a dependency on `libgcc-s1` during rocm-dev install on Debian Buster. The HIP runtime removed this Debian package dependency and uses `libgcc1` instead for this distro.
* Build issue with dynamically loading `COMGR` on Fedora and other distros. The HIP runtime no longer links against `libamd_comgr.so`.
* Failure in the `hipStreamDestroy` API when the stream type is `hipStreamLegacy`. The API now returns the error code `hipErrorInvalidResourceHandle` in this case (see the sketch following this list).
* Kernel launch errors, such as `shared object initialization failed`, `invalid device function`, or `kernel execution failure`. The HIP runtime now loads `COMGR` properly, taking both the file name and the mapped image into account.
* Memory access fault in some applications. The HIP runtime fixed offset accumulation in the memory address.
* Memory leak in virtual memory management (VMM). The HIP runtime now uses the handle size for the allocated memory range instead of the actual physical memory size, which fixed an address clash in VMM.
* Large memory allocation issue. The HIP runtime now checks GPU video RAM and system RAM properly and sets size limits during memory allocation on either the host or the GPU device.
* Support for the `hipDeviceMallocContiguous` flag in `hipExtMallocWithFlags()`. It now enables `HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG` in the memory pool allocation on the GPU device.
* Random memory segmentation fault when handling `GraphExec` object release and device synchronization. The HIP runtime now uses an internal device synchronize function in `__hipUnregisterFatBinary`.
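As a minimal sketch of the corrected `hipStreamDestroy` behavior (assuming the `hipStreamLegacy` handle can be passed directly, as with CUDA's legacy stream handle):
```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
    // Destroying the implicit legacy (NULL) stream is invalid; the runtime
    // now reports this instead of failing in an undefined way.
    hipError_t err = hipStreamDestroy(hipStreamLegacy);
    if (err == hipErrorInvalidResourceHandle) {
        printf("rejected as expected: %s\n", hipGetErrorString(err));
    }
    return 0;
}
```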
### **hipBLASLt** (0.12.1)
#### Added
* Support for gfx1151 on Linux, complementing the previous support in the HIP SDK for Windows.
### **RCCL** (2.22.3)
#### Added
* Added support for the LL128 protocol on gfx942.
### **rocBLAS** (4.4.1)
#### Resolved issues
* rocBLAS might have failed to produce correct results for cherk/zherk on gfx90a/gfx942 with problem sizes k > 500 because the imaginary portion on the C matrix diagonal was not zero. rocBLAS now zeros the imaginary portion.
### **ROCm Compute Profiler** (3.1.1)
#### Added
* 8-bit floating point (FP8) metrics support for AMD Instinct MI300 GPUs.
* Additional data types for roofline: FP8, FP16, BF16, FP32, FP64, I8, I32, I64 (dependent on the GPU architecture).
* Data type selection option ``--roofline-data-type / -R`` for roofline profiling. The default data type is FP32.
#### Changed
* Changed dependency from `rocm-smi` to `amd-smi`.
#### Resolved issues
* Fixed a crash related to Agent ID caused by the new format of the `rocprofv3` output CSV file.
### **ROCm Systems Profiler** (1.0.2)
#### Optimized
* Improved readability of the OpenMP target offload traces by showing them on a single Perfetto track.
#### Resolved issues
* Fixed the file path to the script that merges Perfetto files from multi-process MPI runs. The script has also been renamed from `merge-multiprocess-output.sh` to `rocprof-sys-merge-output.sh`.
### **ROCm Validation Suite** (1.1.0)
#### Added
* NPS2/DPX and NPS4/CPX partition modes support for AMD Instinct MI300X.
### **rocPRIM** (3.4.1)
#### Upcoming changes
* Changes to the template parameters of warp and block algorithms will be made in an upcoming release.
* Due to an upcoming compiler change, the following symbols related to warp size have been marked as deprecated and will be removed in an upcoming major release:
* `rocprim::device_warp_size()`. This has been replaced by `rocprim::arch::wavefront::min_size()` and `rocprim::arch::wavefront::max_size()` for compile-time constants. Use these when allocating global or shared memory. For run-time constants, use `rocprim::arch::wavefront::size()`.
* `rocprim::warp_size()`
* `ROCPRIM_WAVEFRONT_SIZE`
* The default scan accumulator types for device-level scan algorithms will be changed in an upcoming release, resulting in a breaking change. Previously, the default accumulator type was set to the input type for inclusive scans and to the initial value type for exclusive scans. This could lead to unexpected overflow if the input or initial type was smaller than the output type and the accumulator type wasn't explicitly set using the `AccType` template parameter (a sketch of pinning `AccType` explicitly follows this list). The new default accumulator types will be the type that results from applying the scan operator to the input or initial value type.
The following is the complete list of affected functions and how their default accumulator types are changing:
* `rocprim::inclusive_scan`
* current default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>`
* future default: `class AccType = rocprim::invoke_result_binary_op_t<typename std::iterator_traits<InputIterator>::value_type, BinaryFunction>`
* `rocprim::deterministic_inclusive_scan`
* current default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>`
* future default: `class AccType = rocprim::invoke_result_binary_op_t<typename std::iterator_traits<InputIterator>::value_type, BinaryFunction>`
* `rocprim::exclusive_scan`
* current default: `class AccType = detail::input_type_t<InitValueType>>`
* future default: `class AccType = rocprim::invoke_result_binary_op_t<rocprim::detail::input_type_t<InitValueType>, BinaryFunction>`
* `rocprim::deterministic_exclusive_scan`
* current default: `class AccType = detail::input_type_t<InitValueType>>`
* future default: `class AccType = rocprim::invoke_result_binary_op_t<rocprim::detail::input_type_t<InitValueType>, BinaryFunction>`
* `rocprim::load_cs` and `rocprim::store_cs` are deprecated and will be removed in an upcoming release. Alternatively, you can use `rocprim::load_nontemporal` and `rocprim::store_nontemporal` to load and store values in specific conditions (like bypassing the cache) for `rocprim::thread_load` and `rocprim::thread_store`.
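The accumulator-type change above can be insulated against by pinning `AccType` explicitly. The following is a minimal sketch (it assumes rocPRIM's documented device-level `inclusive_scan` overload, where `AccType` is the trailing template parameter; `scan_u8_to_u32` is an illustrative helper, not a rocPRIM API):
```cpp
#include <rocprim/rocprim.hpp>
#include <hip/hip_runtime.h>
#include <cstdint>

// Scan 8-bit inputs into 32-bit prefix sums. With the current default,
// the accumulator would be uint8_t and could overflow; pinning AccType
// avoids relying on either the current or the future default.
hipError_t scan_u8_to_u32(void* temp_storage, size_t& storage_bytes,
                          const uint8_t* d_in, uint32_t* d_out, size_t n,
                          hipStream_t stream = 0)
{
    using Op = rocprim::plus<uint32_t>;
    return rocprim::inclusive_scan<rocprim::default_config,
                                   const uint8_t*, uint32_t*, Op,
                                   uint32_t>( // explicit AccType
        temp_storage, storage_bytes, d_in, d_out, n, Op(), stream);
}
```
As with other rocPRIM device functions, a first call with a null `temp_storage` pointer only queries `storage_bytes`; a second call performs the scan.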
### **rocSHMEM** (2.0.1)
#### Resolved issues
* Incorrect output for `rocshmem_ctx_my_pe` and `rocshmem_ctx_n_pes`.
* Multi-team errors by providing team specific buffers in `rocshmem_ctx_wg_team_sync`.
* Missing implementation of `rocshmem_g` for IPC conduit.
### **rocSOLVER** (3.28.2)
#### Added
* Hybrid computation support for existing routines, such as STERF.
* SVD for general matrices based on Cuppen's Divide and Conquer algorithm:
- GESDD (with batched and strided\_batched versions)
#### Optimized
* Reduced the device memory requirements for STEDC, SYEVD/HEEVD, and SYGVD/HEGVD.
* Improved the performance of STEDC and the divide-and-conquer eigensolvers.
* Improved the performance of SYTRD, the initial step of the eigensolvers that start with the tridiagonalization of the input matrix.
## ROCm 6.4.1
See the [ROCm 6.4.1 release notes](https://rocm-stg.amd.com/en/latest/about/release-notes.html)
See the [ROCm 6.4.1 release notes](https://rocm.docs.amd.com/en/docs-6.4.1/about/release-notes.html)
for a complete overview of this release.
### **AMD SMI** (25.4.2)
@@ -24,7 +180,7 @@ for a complete overview of this release.
#### Optimized
* Improved load times for CLI commands when the GPU has multiple parititons.
* Improved load times for CLI commands when the GPU has multiple partitions.
#### Resolved issues
@@ -34,9 +190,8 @@ for a complete overview of this release.
* When using the `--follow` flag with `amd-smi ras --cper`, CPER entries are not streamed continuously as intended. This will be fixed in an upcoming ROCm release.
```{note}
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
```
> [!NOTE]
> See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
### **HIP** (6.4.1)
@@ -117,9 +272,8 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
- Fixed partition enumeration. It now refers to the correct DRM Render and Card paths.
```{note}
See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
```
> [!NOTE]
> See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
### **ROCm Systems Profiler** (1.0.1)
@@ -257,9 +411,8 @@ Some workaround options are as follows:
- The `pasid` field in struct `amdsmi_process_info_t` will be deprecated in a future ROCm release.
```{note}
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
```
> [!NOTE]
> See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
### **AMDMIGraphX** (2.12.0)
@@ -867,9 +1020,8 @@ The following lists the backward incompatible changes planned for upcoming major
- Fixed `rsmi_dev_target_graphics_version_get`, `rocm-smi --showhw`, and `rocm-smi --showprod` not displaying graphics version correctly for Instinct MI200 series, MI100 series, and RDNA3-based GPUs.
```{note}
See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
```
> [!NOTE]
> See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
### **ROCm Systems Profiler** (1.0.0)
@@ -1295,9 +1447,8 @@ for a complete overview of this release.
* Fixed `amd-smi monitor`'s reporting of encode and decode information. `VCLOCK` and `DCLOCK` are
now associated with both `ENC_UTIL` and `DEC_UTIL`.
```{note}
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANGELOG.md) for more details and examples.
```
> [!NOTE]
> See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANGELOG.md) for more details and examples.
### **HIP** (6.3.1)
@@ -1501,9 +1652,8 @@ for a complete overview of this release.
- The new partition command can display GPU information, including memory and accelerator partition information.
- The command will be at full functionality once additional partition information from `amdsmi_get_gpu_accelerator_partition_profile()` has been implemented.
```{note}
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANGELOG.md) for more details and examples.
```
> [!NOTE]
> See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANGELOG.md) for more details and examples.
### **HIP** (6.3.0)
@@ -1637,18 +1787,17 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANG
* Support for `fp8` data types
### **hipRAND** (2.11.0[*](#id22))
### **hipRAND** (2.11.0)
> [!NOTE]
> In ROCm 6.3.0, the hipRAND package version is incorrectly set to `2.11.0`.
> In ROCm 6.2.4, the hipRAND package version was `2.11.1`.
> The hipRAND version number will be corrected in a future ROCm release.
#### Changed
* Updated the default value for the `-a` argument from `rmake.py` to `gfx906:xnack-,gfx1030,gfx1100,gfx1101,gfx1102`.
#### Known issues
* In ROCm 6.3.0, the hipRAND package version is incorrectly set to `2.11.0`. In ROCm
6.2.4, the hipRAND package version was `2.11.1`. The hipRAND version number will be corrected in a
future ROCm release.
#### Resolved issues
* Fixed an issue in `rmake.py` where the list storing the CMake options would contain individual characters instead of a full string of options.
@@ -1849,7 +1998,7 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANG
#### Known issues
* See [MIVisionX memory access fault in Canny edge detection](#mivisionx-memory-access-fault-in-canny-edge-detection).
* See [MIVisionX memory access fault in Canny edge detection](https://github.com/ROCm/ROCm/issues/4086).
* Package installation requires the manual installation of OpenCV.
* Installation on CentOS/RedHat/SLES requires the manual installation of the `FFMPEG Dev` package.
* Hardware decode requires installation with `--usecase=graphics` in addition to `--usecase=rocm`.
@@ -2040,9 +2189,9 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANG
#### Known issues
- See [ROCm Compute Profiler post-upgrade](#rocm-compute-profiler-post-upgrade).
- See [ROCm Compute Profiler post-upgrade](https://github.com/ROCm/ROCm/issues/4082).
- See [ROCm Compute Profiler CTest failure in CI](#rocm-compute-profiler-ctest-failure-in-ci).
- See [ROCm Compute Profiler CTest failure in CI](https://github.com/ROCm/ROCm/issues/4085).
### **ROCm Data Center Tool** (0.3.0)
@@ -2055,7 +2204,7 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/6.3.x/CHANG
#### Known issues
- See [ROCm Data Center Tool incorrect RHEL9 package version](#rocm-data-center-tool-incorrect-rhel9-package-version).
- See [ROCm Data Center Tool incorrect RHEL9 package version](https://github.com/ROCm/ROCm/issues/4089).
### **ROCm SMI** (7.4.0)
@@ -2093,9 +2242,8 @@ memory partition modes upon an invalid argument return from memory partition mod
- C++ tests for `memorypartition_read_write` are to be re-enabled in a future ROCm release.
```{note}
See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/6.3.x/CHANGELOG.md) for more details and examples.
```
> [!NOTE]
> See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/6.3.x/CHANGELOG.md) for more details and examples.
### **ROCm Systems Profiler** (0.1.0)
@@ -2109,7 +2257,7 @@ See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/6.3.
#### Known issues
- See [ROCm Systems Profiler post-upgrade](#rocm-systems-profiler-post-upgrade).
- See [ROCm Systems Profiler post-upgrade](https://github.com/ROCm/ROCm/issues/4083).
### **ROCm Validation Suite** (1.1.0)
@@ -2123,7 +2271,7 @@ See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/6.3.
#### Known issues
- See [ROCm Validation Suite needs specified configuration file](#rocm-validation-suite-needs-specified-configuration-file).
- See [ROCm Validation Suite needs specified configuration file](https://github.com/ROCm/ROCm/issues/4090).
### **rocPRIM** (3.3.0)
@@ -2866,10 +3014,8 @@ for a complete overview of this release.
See [issue #3500](https://github.com/ROCm/ROCm/issues/3500) on GitHub.
```{note}
See the [detailed AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/docs/6.2.0/CHANGELOG.md)
on GitHub for more information.
```
> [!NOTE]
> See the [detailed AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/docs/6.2.0/CHANGELOG.md) on GitHub for more information.
### **Composable Kernel** (1.1.0)
@@ -3468,9 +3614,8 @@ The compiler may incorrectly compile a program that uses the
the function is undefined along some path to the function. For most functions,
uninitialized inputs cause undefined behavior.
```{note}
The ``-Wall`` compilation flag prompts the compiler to generate a warning if a variable is uninitialized along some path.
```
> [!NOTE]
> The ``-Wall`` compilation flag prompts the compiler to generate a warning if a variable is uninitialized along some path.
As a workaround, initialize the parameters passed to ``__shfl``. For example:
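A minimal sketch of the workaround (hypothetical kernel; `in` and `out` are illustrative):
```cpp
__global__ void shfl_workaround(float* out, const float* in) {
    // Give the __shfl input a defined value on every control-flow path so
    // no path leaves it uninitialized (and therefore undefined).
    float v = 0.0f;
    if (threadIdx.x < 64) {
        v = in[threadIdx.x];
    }
    out[threadIdx.x] = __shfl(v, 0); // broadcast lane 0's value
}
```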
@@ -3791,10 +3936,8 @@ See [issue #3498](https://github.com/ROCm/ROCm/issues/3498) on GitHub.
- Fixed Partition ID CLI output.
```{note}
See the [detailed ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/docs/6.2.0/CHANGELOG.md)
on GitHub for more information.
```
> [!NOTE]
> See the [detailed ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/docs/6.2.0/CHANGELOG.md) on GitHub for more information.
### **ROCm Validation Suite** (1.0.0)
@@ -4164,9 +4307,8 @@ for a complete overview of this release.
* Fixed the `amdsmitstReadWrite.TestPowerCapReadWrite` test for RDNA3, RDNA2, and MI100 devices.
* Fixed an issue with the `amdsmi_get_gpu_memory_reserved_pages` and `amdsmi_get_gpu_bad_page_info` Python interface calls.
```{note}
See the AMD SMI [detailed changelog](https://github.com/ROCm/amdsmi/blob/rocm-6.1.x/CHANGELOG.md) with code samples for more information.
```
> [!NOTE]
> See the AMD SMI [detailed changelog](https://github.com/ROCm/amdsmi/blob/rocm-6.1.x/CHANGELOG.md) with code samples for more information.
### **RCCL** (2.18.6)
@@ -4246,9 +4388,8 @@ for a complete overview of this release.
- `amd-smi bad-pages` can result in a `ValueError: Null pointer access` error when using some PMU firmware versions.
```{note}
See the [detailed changelog](https://github.com/ROCm/amdsmi/blob/docs/6.1.1/CHANGELOG.md) with code samples for more information.
```
> [!NOTE]
> See the [detailed changelog](https://github.com/ROCm/amdsmi/blob/docs/6.1.1/CHANGELOG.md) with code samples for more information.
### **hipBLASLt** (0.7.0)
@@ -4317,9 +4458,8 @@ See the [detailed changelog](https://github.com/ROCm/amdsmi/blob/docs/6.1.1/CHAN
- ROCm SMI reports GPU utilization incorrectly for RDNA3 GPUs in some situations. See the issue on [GitHub](https://github.com/ROCm/ROCm/issues/3112).
```{note}
See the [detailed ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/docs/6.1.1/CHANGELOG.md) with code samples for more information.
```
> [!NOTE]
> See the [detailed ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/docs/6.1.1/CHANGELOG.md) with code samples for more information.
## ROCm 6.1.0
@@ -5036,16 +5176,16 @@ on GitHub for a complete overview of this release.
### **rocSPARSE** (2.5.4)
##### Added
#### Added
- Added more mixed precisions for SpMV, (matrix: float, vectors: double, calculation: double) and (matrix: rocsparse_float_complex, vectors: rocsparse_double_complex, calculation: rocsparse_double_complex)
- Added support for gfx940, gfx941 and gfx942
##### Optimized
#### Optimized
- Fixed a bug in csrsm and bsrsm
##### Known issues
#### Known issues
In csritlu0, the `rocsparse_itilu0_alg_sync_split_fusion` algorithm has accuracy issues that are still under investigation when XNACK is enabled. The fallback is `rocsparse_itilu0_alg_sync_split`.
@@ -5131,7 +5271,7 @@ on GitHub for a complete overview of this release.
### **HIP** (5.6.0)
##### Added
#### Added
- Added hipRTC support for amd_hip_fp16
- Added hipStreamGetDevice implementation to get the device associated with the stream
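A minimal sketch of the new query (assuming the CUDA-style `hipStreamGetDevice(hipStream_t, hipDevice_t*)` signature):
```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
    hipStream_t stream;
    (void)hipStreamCreate(&stream);

    hipDevice_t device; // an integer device ordinal
    if (hipStreamGetDevice(stream, &device) == hipSuccess) {
        printf("stream was created on device %d\n", device);
    }

    (void)hipStreamDestroy(stream);
    return 0;
}
```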
@@ -5140,7 +5280,7 @@ on GitHub for a complete overview of this release.
- hipArrayGetDescriptor for getting 1D or 2D array descriptor
- hipArray3DGetDescriptor to get 3D array descriptor
##### Changed
#### Changed
- hipMallocAsync to return success for zero-size allocations, to match hipMalloc (see the sketch after this list)
- Separation of hipcc Perl binaries from the HIP project into the hipcc project. The hip-devel package depends on the newly added hipcc package
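A minimal sketch of the zero-size behavior noted above (hedged: the pointer value returned for a zero-byte request is not specified here):
```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
    hipStream_t stream;
    (void)hipStreamCreate(&stream);

    void* p = nullptr;
    // A zero-byte async allocation now returns hipSuccess, matching hipMalloc.
    if (hipMallocAsync(&p, 0, stream) == hipSuccess) {
        printf("zero-size hipMallocAsync succeeded\n");
    }

    (void)hipStreamSynchronize(stream);
    (void)hipStreamDestroy(stream);
    return 0;
}
```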
@@ -5445,15 +5585,15 @@ $ gcc main.c -I/opt/rocm-5.6.0/include -L/opt/rocm-5.6.0/lib -lrocprofiler64-v2
The resulting `a.out` will depend on
`/opt/rocm-5.6.0/lib/librocprofiler64.so.2`.
##### Added
#### Added
- 'end_time' needs to be disabled in roctx_trace.txt
##### Optimized
#### Optimized
- Improved Test Suite
##### Resolved issues
#### Resolved issues
- rocprof in ROCm/5.4.0: GPU selector broken.
- rocprof in ROCm/5.4.1 fails to generate kernel info.

README.md
View File

@@ -19,142 +19,17 @@ ROCm supports programming models, such as OpenMP and OpenCL, and includes all ne
source software compilers, debuggers, and libraries. ROCm is fully integrated into machine learning
(ML) frameworks, such as PyTorch and TensorFlow.
## Getting the ROCm Source Code
> [!IMPORTANT]
> A new open source build platform for ROCm is under development at
> https://github.com/ROCm/TheRock, featuring a unified CMake build with bundled
> dependencies, Windows support, and more.
>
> The instructions below describe the prior process for building from source
> which will be replaced once TheRock is mature enough.
AMD ROCm is built from open source software. It is, therefore, possible to modify the various components of ROCm by downloading the source code and rebuilding the components. The source code for ROCm components can be cloned from each of the GitHub repositories using git. For easy access to download the correct versions of each of these tools, the ROCm repository contains a repo manifest file called [default.xml](./default.xml). You can use this manifest file to download the source code for ROCm software.
## Getting and Building ROCm from Source
### Installing the repo tool
The repo tool from Google allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo tool:
```bash
mkdir -p ~/bin/
curl https://storage.googleapis.com/git-repo-downloads/repo > ~/bin/repo
chmod a+x ~/bin/repo
```
**Note:** The ```~/bin/``` folder is used as an example. You can specify a different folder to install the repo tool into if you desire.
### Installing git-lfs
Some ROCm projects use the Git Large File Storage (LFS) format that may require you to install git-lfs. Refer to [Git Large File Storage](https://github.com/git-lfs/git-lfs/blob/main/INSTALLING.md) for more information. For example, to install git-lfs for Ubuntu, use the following command:
```bash
sudo apt-get install git-lfs
```
### Downloading the ROCm source code
The following example shows how to use the repo tool to download the ROCm source code. If you choose a directory other than ~/bin/ to install the repo tool, you must use that chosen directory in the code as shown below:
```bash
mkdir -p ~/ROCm/
cd ~/ROCm/
export ROCM_VERSION=6.4.1
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.4.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
~/bin/repo sync
```
**Note:** Using this sample code will cause the repo tool to download the open source code associated with the specified ROCm release. Ensure that you have ssh-keys configured on your machine for your GitHub ID prior to the download as explained at [Connecting to GitHub with SSH](https://docs.github.com/en/authentication/connecting-to-github-with-ssh).
## Building the ROCm source code
Each ROCm component repository contains directions for building that component, such as the rocSPARSE documentation [Installation and Building for Linux](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/install/Linux_Install_Guide.html). Refer to the specific component documentation for instructions on building the repository.
Each release of the ROCm software supports specific hardware and software configurations. Refer to [System requirements (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) for the current supported hardware and OS.
## Build ROCm from source
The build uses as many processors as it can find to build in parallel. Some of the compiles can consume as much as 10 GB of RAM, so make sure you have plenty of swap space!
By default, the ROCm build compiles for all supported GPU architectures and takes approximately 500 CPU hours.
The build time is reduced significantly if you limit the GPU architectures to build against using the GPU_ARCHS environment variable, as mentioned below.
```bash
# --------------------------------------
# Step1: clone source code
# --------------------------------------
mkdir -p ~/WORKSPACE/ # Or any folder name other than WORKSPACE
cd ~/WORKSPACE/
export ROCM_VERSION=6.4.1
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.4.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
~/bin/repo sync
# --------------------------------------
# Step 2: Prepare build environment
# --------------------------------------
# Option 1: Start a docker container
# Pulling required base docker images:
# Ubuntu22.04 built from ROCm/tools/rocm-build/docker/ubuntu22/Dockerfile
docker pull rocm/rocm-build-ubuntu-22.04:6.4
# Ubuntu24.04 built from ROCm/tools/rocm-build/docker/ubuntu24/Dockerfile
docker pull rocm/rocm-build-ubuntu-24.04:6.4
# Start docker container and mount the source code folder:
docker run -ti \
-e ROCM_VERSION=${ROCM_VERSION} \
-e CCACHE_DIR=$HOME/.ccache \
-e CCACHE_ENABLED=true \
-e DOCK_WORK_FOLD=/src \
-w /src \
-v $PWD:/src \
-v /etc/passwd:/etc/passwd \
-v /etc/shadow:/etc/shadow \
-v ${HOME}/.ccache:${HOME}/.ccache \
-u $(id -u):$(id -g) \
<replace_with_required_ubuntu_base_docker_image> bash
# Option 2: Install required packages into the host machine
# For ubuntu22.04 system
cd ROCm/tools/rocm-build/docker/ubuntu22
cp * /tmp && cd /tmp
bash install-prerequisites.sh
# For ubuntu24.04 system
cd ROCm/tools/rocm-build/docker/ubuntu24
cp * /tmp && cd /tmp
bash install-prerequisites.sh
# --------------------------------------
# Step 3: Run build command line
# --------------------------------------
# Select GPU targets before building:
# When GPU_ARCHS is not set, the default GPU targets supported by ROCm 6.1 will be used.
# To build against a subset of GFX architectures, use the env variable below.
# Supports MI300 (gfx940, gfx941, gfx942).
export GPU_ARCHS="gfx942" # Example
export GPU_ARCHS="gfx940;gfx941;gfx942" # Example
# Pick and run build commands in the docker container:
# Build rocm-dev packages
make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} rocm-dev
# Build all ROCm packages
make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} all
# list all ROCm components to find required components
make -f ROCm/tools/rocm-build/ROCm.mk list_components
# Build a single ROCm packages
make -f ROCm/tools/rocm-build/ROCm.mk T_rocblas
# Find built packages in ubuntu22.04:
out/ubuntu-22.04/22.04/deb/
# Find built packages in ubuntu24.04:
out/ubuntu-24.04/24.04/deb/
# Find built logs in ubuntu22.04:
out/ubuntu-22.04/22.04/logs/
# Find built logs in ubuntu24.04:
out/ubuntu-24.04/24.04/logs/
# All logs pertaining to failed components end with the .errors extension.
out/ubuntu-22.04/22.04/logs/rocblas.errors # Example
# All logs pertaining to components still building end with the .inprogress extension.
out/ubuntu-22.04/22.04/logs/rocblas.inprogress # Example
# All logs pertaining to passed components use the component name.
out/ubuntu-22.04/22.04/logs/rocblas # Example
```
Note: [Overview for ROCm.mk](tools/rocm-build/README.md)
Please use [TheRock](https://github.com/ROCm/TheRock) to build ROCm from source.
## ROCm documentation

View File

@@ -10,7 +10,7 @@
<!-- markdownlint-disable reference-links-images -->
<!-- markdownlint-disable no-missing-space-atx -->
<!-- spellcheck-disable -->
# ROCm 6.4.1 release notes
# ROCm 6.4.2 release notes
The release notes provide a summary of notable changes since the previous ROCm release.
@@ -24,63 +24,81 @@ The release notes provide a summary of notable changes since the previous ROCm r
- [ROCm known issues](#rocm-known-issues)
- [ROCm resolved issues](#rocm-resolved-issues)
- [ROCm upcoming changes](#rocm-upcoming-changes)
```{note}
If you're using Radeon™ PRO or Radeon GPUs in a workstation setting with a display connected, see the [Use ROCm on Radeon GPUs](https://rocm.docs.amd.com/projects/radeon/en/latest/docs/compatibility/native_linux/native_linux_compatibility.html)
If you're using AMD Radeon™ PRO or Radeon GPUs in a workstation setting with a display connected, see the [Use ROCm on Radeon GPUs](https://rocm.docs.amd.com/projects/radeon/en/latest/docs/compatibility/native_linux/native_linux_compatibility.html)
documentation to verify compatibility and system requirements.
```
## Release highlights
The following are notable new features and improvements in ROCm 6.4.1. For changes to individual components, see
The following are notable new features and improvements in ROCm 6.4.2. For changes to individual components, see
[Detailed component changes](#detailed-component-changes).
### Addition of DPX partition mode under NPS2 memory mode
AMD Instinct MI300X now supports DPX partition mode under NPS2 memory mode. For more partitioning information, see the [Deep dive into the MI300 compute and memory partition modes](https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html) blog and [AMD Instinct MI300X system optimization](https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#change-gpu-partition-modes).
### ROCm Compute Profiler enhancements
### Introducing the ROCm Data Science toolkit
[ROCm Compute Profiler](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/index.html) includes the following changes:
The ROCm Data Science toolkit (or ROCm-DS) is an open-source software collection for high-performance data science applications built on the core ROCm platform. You can leverage ROCm-DS to accelerate both new and existing data science workloads, allowing you to execute intensive applications with larger datasets at lightning speed. ROCm-DS is in an early access state. Running production workloads is not recommended. For more information, see [AMD ROCm-DS Documentation](https://rocm.docs.amd.com/projects/rocm-ds/en/latest/index.html).
* The ``--roofline-data-type`` option now supports FP8, FP16, BF16, FP32, FP64, I8, I32, and I64 data types. This is dependent on the GPU architecture. For more information, see [Roofline options](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/docs-6.4.2/how-to/profile/mode.html#roofline-options).
* ROCm Compute Profiler now uses [AMD SMI](https://rocm.docs.amd.com/projects/amdsmi/en/latest/index.html) instead of [ROCm SMI](https://rocm.docs.amd.com/projects/rocm_smi_lib/en/latest/index.html). The AMD System Management Interface Library (AMD SMI) is a successor to ROCm SMI. It is a unified system management interface tool that provides a user-space interface for applications to monitor and control GPU applications and gives users the ability to query information about drivers and GPUs on the system. For more information, see [https://github.com/ROCm/amdsmi](https://github.com/ROCm/amdsmi) and the [AMD SMI documentation](https://rocm.docs.amd.com/projects/amdsmi/en/latest/index.html).
* ROCm Compute Profiler has added 8-bit floating point (FP8) metrics support for AMD Instinct MI300 series accelerators. For more information, see [System Speed-of-Light](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/docs-6.4.2/conceptual/system-speed-of-light.html).
### rocSOLVER enhancements
rocSOLVER has improved the performance of eigensolvers and singular value decomposition (SVD). For more information, see [rocSOLVER documentation](https://rocm.docs.amd.com/projects/rocSOLVER/en/docs-6.4.2/index.html).
### ROCm Offline Installer Creator updates
The ROCm Offline Installer Creator 6.4.1 now allows you to use the SPACEBAR or ENTER keys for menu item selection in the GUI. It also adds support for Debian 12 and fixes an issue for “full” mode RHEL offline installer creation, where GDM packages were uninstalled during offline installation. See [ROCm Offline Installer Creator](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/rocm-offline-installer.html) for more information.
The ROCm Offline Installer Creator 6.4.2 includes the following features and improvements:
* Added support for Oracle Linux 8.10 and 9.6, and SLES 15 SP7.
* Additional package options for the Offline Installer Creator, including `amd-smi`, `rocdecode`, `rocjpeg`, and `rdc`.
* ROCm meta packages are now used for selecting ROCm components and use cases.
* Improved separation of kernel/driver and ROCm prerequisite packages to reduce the size of ROCm-only or driver-only offline installers.
In addition, the option to build an offline installer based on ROCm version 5.7.3 has been removed. To build an offline installer for ROCm 5.7.3, use the Offline Installer Creator from version 6.4.1 or earlier. See [ROCm Offline Installer Creator](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.2/install/rocm-offline-installer.html) for more information.
### ROCm Runfile Installer updates
The ROCm Runfile Installer 6.4.1 adds the following improvements:
- Relaxed version checks for installation on different distributions. Provided the dependencies are not installed by the Runfile Installer, you can target installation for a different path from the host system running the installer. For example, the installer can run on a system using Ubuntu 22.04 and install to a partition/system that is using Ubuntu 24.04.
- Performance improvements for detecting a previous ROCm install.
- Removal of the extra `opt` directory created for the target during the ROCm installation. For example, installing to `target=/home/amd` now installs ROCm to `/home/amd/rocm-6.4.1` and not `/home/amd/opt/rocm-6.4.1`. For installs using `target=/`, the installation will continue to use `/opt/`.
- The Runfile Installer can be used to uninstall any Runfile-based installation of the driver.
- In the CLI interface, the `postrocm` argument can now be run separately from the `rocm` argument. In cases where `postrocm` was missed from the initial ROCm install, `postrocm` can now be run on the same target folder. For example, if you installed ROCm 6.4.1 using `install.run target=/myrocm rocm`, you can run the post-installation separately using the command `install.run target=/myrocm/rocm-6.4.1 postrocm`.
For more information, see [ROCm Runfile Installer](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/rocm-runfile-installer.html).
The ROCm Runfile Installer 6.4.2 adds support for Oracle Linux 8.10 and 9.6 (using the RHEL 8 or 9 .run files), Debian 12 (using the Ubuntu 22.04 .run file), and SLES 15 SP7. It also fixes permission settings issues during ROCm and AMDGPU driver installation. For more information, see [ROCm Runfile Installer](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.2/install/rocm-runfile-installer.html).
### ROCm documentation updates
ROCm documentation continues to be updated to provide clearer and more comprehensive guidance for a wider variety of user needs and use cases.
* [Tutorials for AI developers](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/) have been expanded with five new tutorials. These tutorials are Jupyter notebook-based, easy-to-follow documents. They are ideal for AI developers who want to learn about specific topics, including inference, fine-tuning, and training. For more information about the changes, see [Changelog for the AI Developer Hub](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/changelog.html).
* The [Training a model with LLM Foundry](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry.html) performance testing guide has been added. This guide describes how to use the preconfigured [ROCm/pytorch-training](https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5) training environment and [https://github.com/ROCm/MAD](https://github.com/ROCm/MAD) to test the training performance of the LLM Foundry framework on AMD Instinct MI325X and MI300X accelerators using the [MPT-30B](https://huggingface.co/mosaicml/mpt-30b) model.
* The [Training a model with PyTorch](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.html) performance testing guide has been updated to feature the latest [ROCm/pytorch-training](https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5) Docker image (a preconfigured training environment with ROCm and PyTorch). Support for [Llama 3.3 70B](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) has been added.
* The [Training a model with JAX MaxText](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.html) performance testing guide has been updated to feature the latest [ROCm/jax-training](https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b) Docker image (a preconfigured training environment with ROCm, JAX, and [MaxText](https://github.com/AI-Hypercomputer/maxtext)). Support for [Llama 3.3 70B](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) has been added.
* The [vLLM inference performance testing](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference/vllm-benchmark.html?model=pyt_vllm_qwq-32b) guide has been updated to feature the latest [ROCm/vLLM](https://hub.docker.com/layers/rocm/vllm/latest/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4) Docker image (a preconfigured environment for inference with ROCm and [vLLM](https://docs.vllm.ai/en/latest/)). Support for the [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) model has been added.
* The [PyTorch inference performance testing](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference/pytorch-inference-benchmark.html?model=pyt_clip_inference) guide has been added, featuring the [ROCm/PyTorch](https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-ab1d350b818b90123cfda31363019d11c0d41a8f12a19e3cb2cb40cf0261137d) Docker image (a preconfigured inference environment with ROCm and PyTorch) with initial support for the [CLIP](https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K) and [Chai-1](https://huggingface.co/chaidiscovery/chai-1) models.
* [Tutorials for AI developers](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/) have been expanded with the following four new tutorials:
* Inference tutorial: [AI agent with MCPs using vLLM and PydanticAI](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/notebooks/inference/build_airbnb_agent_mcp.html)
* GPU development and optimization tutorials:
* [Kernel development and optimization with Triton](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/notebooks/gpu_dev_optimize/triton_kernel_dev.html)
* [Profiling Llama-4 inference with vLLM](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/notebooks/gpu_dev_optimize/llama4_profiling_vllm.html)
* [FP8 quantization with AMD Quark for vLLM](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/notebooks/gpu_dev_optimize/fp8_quantization_quark_vllm.html)
For more information about the changes, see [Changelog for the AI Developer Hub](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/changelog.html).
* ROCm provides a comprehensive ecosystem for deep learning development. For more details, see [Deep learning frameworks for ROCm](https://rocm.docs.amd.com/en/docs-6.4.2/how-to/deep-learning-rocm.html). As of July 2025, AMD ROCm provides support for the following additional deep learning frameworks:
* Deep Graph Library (DGL) is an easy-to-use, high-performance, and scalable Python package for deep learning on graphs. DGL is framework agnostic: if a deep graph model is a component of an end-to-end application, the rest of the logic can be implemented in PyTorch. It is currently supported on ROCm 6.4.0. For more information, see [DGL compatibility](https://rocm.docs.amd.com/en/docs-6.4.2/compatibility/ml-compatibility/dgl-compatibility.html).
* Stanford Megatron-LM is a large-scale language model training framework. It's designed to train massive transformer-based language models efficiently using model and data parallelism. It is currently supported on ROCm 6.3.0. For more information, see [Stanford Megatron-LM compatibility](https://rocm.docs.amd.com/en/docs-6.4.2/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html).
* Volcano Engine Reinforcement Learning for LLMs (verl) is a reinforcement learning framework designed for large language models (LLMs). verl offers a scalable, open-source fine-tuning solution optimized for AMD Instinct GPUs with full ROCm support. It is currently supported on ROCm 6.2.0. For more information, see [verl compatibility](https://rocm.docs.amd.com/en/docs-6.4.2/compatibility/ml-compatibility/verl-compatibility.html).
* Documentation for the new [ROCprof Compute Viewer](https://rocm.docs.amd.com/projects/rocprof-compute-viewer/en/docs-6.4.2/) was added in May 2025. This tool is used to visualize and analyze GPU thread trace data collected using [rocprofv3](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/index.html). Note that the ROCprof Compute Viewer is in an early access state; running production workloads is not recommended.
* The AMDGPU installer documentation has been removed to encourage the use of the package manager for ROCm installation. While the package manager is the recommended method, you can still install ROCm using the AMDGPU installer by following the [legacy process](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/install/install-methods/amdgpu-installer-index.html). Be sure to update the command with the intended ROCm version before running it. For more information, see [Installation via native package manager](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.2/install/install-methods/package-manager-index.html).
## Operating system and hardware support changes
ROCm 6.4.1 introduces support for the RDNA4 architecture-based [Radeon AI PRO
R9700](https://www.amd.com/en/products/graphics/workstations/radeon-ai-pro/ai-9000-series/amd-radeon-ai-pro-r9700.html),
[Radeon RX 9070](https://www.amd.com/en/products/graphics/desktops/radeon/9000-series/amd-radeon-rx-9070.html),
[Radeon RX 9070 XT](https://www.amd.com/en/products/graphics/desktops/radeon/9000-series/amd-radeon-rx-9070xt.html),
Radeon RX 9070 GRE, and
[Radeon RX 9060 XT](https://www.amd.com/en/products/graphics/desktops/radeon/9000-series/amd-radeon-rx-9060xt.html) GPUs
for compute workloads. It also adds support for RDNA3 architecture-based [Radeon PRO W7700](https://www.amd.com/en/products/graphics/workstations/radeon-pro/w7700.html) and [Radeon RX 7800 XT](https://www.amd.com/en/products/graphics/desktops/radeon/7000-series/amd-radeon-rx-7800-xt.html) GPUs. These GPUs are supported on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.
ROCm 6.4.2 adds support for SLES 15 SP7. For more information, see [SLES installation](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.2/install/install-methods/package-manager/package-manager-sles.html).
ROCm 6.4.2 marks the end of support (EoS) for RHEL 9.5.
ROCm 6.4.2 adds support for the RDNA3 architecture-based [Radeon RX 7700 XT](https://www.amd.com/en/products/graphics/desktops/radeon/7000-series/amd-radeon-rx-7700-xt.html) GPU. This GPU is supported on Ubuntu 24.04.2 and RHEL 9.6.
For details, see the full list of [Supported GPUs (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.2/reference/system-requirements.html#supported-gpus).
See the [Compatibility matrix](../../docs/compatibility/compatibility-matrix.rst) for more information about operating system and hardware compatibility.
## ROCm components
The following table lists the versions of ROCm components for ROCm 6.4.2, including any version
changes from 6.4.1 to 6.4.2. Click the component's updated version to go to a list of its changes.
Click {fab}`github` to go to the component's source code on GitHub.
<div class="pst-scrollable-table-container">
<tr>
<th rowspan="9">Libraries</th>
<th rowspan="9">Machine learning and computer vision</th>
<td><a href="https://rocm.docs.amd.com/projects/composable_kernel/en/docs-6.4.1/index.html">Composable Kernel</a></td>
<td><a href="https://rocm.docs.amd.com/projects/composable_kernel/en/docs-6.4.2/index.html">Composable Kernel</a></td>
<td>1.1.0</td>
<td><a href="https://github.com/ROCm/composable_kernel"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/AMDMIGraphX/en/docs-6.4.1/index.html">MIGraphX</a></td>
<td><a href="https://rocm.docs.amd.com/projects/AMDMIGraphX/en/docs-6.4.2/index.html">MIGraphX</a></td>
<td>2.12.0</td>
<td><a href="https://github.com/ROCm/AMDMIGraphX"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/MIOpen/en/docs-6.4.1/index.html">MIOpen</a></td>
<td><a href="https://rocm.docs.amd.com/projects/MIOpen/en/docs-6.4.2/index.html">MIOpen</a></td>
<td>3.4.0</td>
<td><a href="https://github.com/ROCm/MIOpen"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/MIVisionX/en/docs-6.4.1/index.html">MIVisionX</a></td>
<td><a href="https://rocm.docs.amd.com/projects/MIVisionX/en/docs-6.4.2/index.html">MIVisionX</a></td>
<td>3.2.0</td>
<td><a href="https://github.com/ROCm/MIVisionX"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocAL/en/docs-6.4.1/index.html">rocAL</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rocAL/en/docs-6.4.2/index.html">rocAL</a></td>
<td>2.2.0</td>
<td><a href="https://github.com/ROCm/rocAL"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocDecode/en/docs-6.4.1/index.html">rocDecode</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rocDecode/en/docs-6.4.2/index.html">rocDecode</a></td>
<td>0.10.0</td>
<td><a href="https://github.com/ROCm/rocDecode"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocJPEG/en/docs-6.4.1/index.html">rocJPEG</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rocJPEG/en/docs-6.4.2/index.html">rocJPEG</a></td>
<td>0.8.0</td>
<td><a href="https://github.com/ROCm/rocJPEG"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocPyDecode/en/docs-6.4.1/index.html">rocPyDecode</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rocPyDecode/en/docs-6.4.2/index.html">rocPyDecode</a></td>
<td>0.3.1</td>
<td><a href="https://github.com/ROCm/rocPyDecode"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rpp/en/docs-6.4.1/index.html">RPP</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rpp/en/docs-6.4.2/index.html">RPP</a></td>
<td>1.9.10</td>
<td><a href="https://github.com/ROCm/rpp"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<th rowspan="2"></th>
<th rowspan="2">Communication</th>
<td><a href="https://rocm.docs.amd.com/projects/rccl/en/docs-6.4.1/index.html">RCCL</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rccl/en/docs-6.4.2/index.html">RCCL</a></td>
<td>2.22.3&nbsp;&Rightarrow;&nbsp;<a href="#rccl-2-22-3">2.22.3</td>
<td><a href="https://github.com/ROCm/rccl"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocSHMEM/en/docs-6.4.1/index.html">rocSHMEM</a></td>
<td>2.0.0</td>
<td><a href="https://rocm.docs.amd.com/projects/rocSHMEM/en/docs-6.4.2/index.html">rocSHMEM</a></td>
<td>2.0.0&nbsp;&Rightarrow;&nbsp;<a href="#rocshmem-2-0-1">2.0.1</td>
<td><a href="https://github.com/ROCm/rocSHMEM"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tr>
<th rowspan="16"></th>
<th rowspan="16">Math</th>
<td><a href="https://rocm.docs.amd.com/projects/hipBLAS/en/docs-6.4.1/index.html">hipBLAS</a></td>
<td><a href="https://rocm.docs.amd.com/projects/hipBLAS/en/docs-6.4.2/index.html">hipBLAS</a></td>
<td>2.4.0</td>
<td><a href="https://github.com/ROCm/hipBLAS"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipBLASLt/en/docs-6.4.1/index.html">hipBLASLt</a></td>
<td>0.12.0&nbsp;&Rightarrow;&nbsp;<a href="#hipblaslt-0-12-1">0.12.1</td>
<td><a href="https://rocm.docs.amd.com/projects/hipBLASLt/en/docs-6.4.2/index.html">hipBLASLt</a></td>
<td>0.12.1&nbsp;&Rightarrow;&nbsp;<a href="#hipblaslt-0-12-1">0.12.1</td>
<td><a href="https://github.com/ROCm/hipBLASLt"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipFFT/en/docs-6.4.1/index.html">hipFFT</a></td>
<td><a href="https://rocm.docs.amd.com/projects/hipFFT/en/docs-6.4.2/index.html">hipFFT</a></td>
<td>1.0.18</td>
<td><a href="https://github.com/ROCm/hipFFT"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipfort/en/docs-6.4.1/index.html">hipfort</a></td>
<td><a href="https://rocm.docs.amd.com/projects/hipfort/en/docs-6.4.2/index.html">hipfort</a></td>
<td>0.6.0</td>
<td><a href="https://github.com/ROCm/hipfort"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipRAND/en/docs-6.4.1/index.html">hipRAND</a></td>
<td><a href="https://rocm.docs.amd.com/projects/hipRAND/en/docs-6.4.2/index.html">hipRAND</a></td>
<td>2.12.0</td>
<td><a href="https://github.com/ROCm/hipRAND"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipSOLVER/en/docs-6.4.1/index.html">hipSOLVER</a></td>
<td><a href="https://rocm.docs.amd.com/projects/hipSOLVER/en/docs-6.4.2/index.html">hipSOLVER</a></td>
<td>2.4.0</td>
<td><a href="https://github.com/ROCm/hipSOLVER"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipSPARSE/en/docs-6.4.1/index.html">hipSPARSE</a></td>
<td><a href="https://rocm.docs.amd.com/projects/hipSPARSE/en/docs-6.4.2/index.html">hipSPARSE</a></td>
<td>3.2.0</td>
<td><a href="https://github.com/ROCm/hipSPARSE"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipSPARSELt/en/docs-6.4.1/index.html">hipSPARSELt</a></td>
<td><a href="https://rocm.docs.amd.com/projects/hipSPARSELt/en/docs-6.4.2/index.html">hipSPARSELt</a></td>
<td>0.2.3</td>
<td><a href="https://github.com/ROCm/hipSPARSELt"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocALUTION/en/docs-6.4.1/index.html">rocALUTION</a></td>
<td>3.2.2&nbsp;&Rightarrow;&nbsp;<a href="#rocalution-3-2-3">3.2.3</td></td>
<td><a href="https://rocm.docs.amd.com/projects/rocALUTION/en/docs-6.4.2/index.html">rocALUTION</a></td>
<td>3.2.3</td>
<td><a href="https://github.com/ROCm/rocALUTION"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocBLAS/en/docs-6.4.1/index.html">rocBLAS</a></td>
<td>4.4.0</td>
<td><a href="https://rocm.docs.amd.com/projects/rocBLAS/en/docs-6.4.2/index.html">rocBLAS</a></td>
<td>4.4.0&nbsp;&Rightarrow;&nbsp;<a href="#rocblas-4-4-1">4.4.1</td></td>
<td><a href="https://github.com/ROCm/rocBLAS"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocFFT/en/docs-6.4.1/index.html">rocFFT</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rocFFT/en/docs-6.4.2/index.html">rocFFT</a></td>
<td>1.0.32</td>
<td><a href="https://github.com/ROCm/rocFFT"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocRAND/en/docs-6.4.1/index.html">rocRAND</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rocRAND/en/docs-6.4.2/index.html">rocRAND</a></td>
<td>3.3.0</td>
<td><a href="https://github.com/ROCm/rocRAND"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocSOLVER/en/docs-6.4.1/index.html">rocSOLVER</a></td>
<td>3.28.0</td>
<td><a href="https://rocm.docs.amd.com/projects/rocSOLVER/en/docs-6.4.2/index.html">rocSOLVER</a></td>
<td>3.28.0&nbsp;&Rightarrow;&nbsp;<a href="#rocsolver-3-28-2">3.28.2</td>
<td><a href="https://github.com/ROCm/rocSOLVER"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocSPARSE/en/docs-6.4.1/index.html">rocSPARSE</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rocSPARSE/en/docs-6.4.2/index.html">rocSPARSE</a></td>
<td>3.4.0</td>
<td><a href="https://github.com/ROCm/rocSPARSE"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocWMMA/en/docs-6.4.1/index.html">rocWMMA</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rocWMMA/en/docs-6.4.2/index.html">rocWMMA</a></td>
<td>1.7.0</td>
<td><a href="https://github.com/ROCm/rocWMMA"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/Tensile/en/docs-6.4.1/src/index.html">Tensile</a></td>
<td><a href="https://rocm.docs.amd.com/projects/Tensile/en/docs-6.4.2/src/index.html">Tensile</a></td>
<td>4.43.0</td>
<td><a href="https://github.com/ROCm/Tensile"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<th rowspan="4"></th>
<th rowspan="4">Primitives</th>
<td><a href="https://rocm.docs.amd.com/projects/hipCUB/en/docs-6.4.1/index.html">hipCUB</a></td>
<td><a href="https://rocm.docs.amd.com/projects/hipCUB/en/docs-6.4.2/index.html">hipCUB</a></td>
<td>3.4.0</td>
<td><a href="https://github.com/ROCm/hipCUB"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/hipTensor/en/docs-6.4.1/index.html">hipTensor</a></td>
<td><a href="https://rocm.docs.amd.com/projects/hipTensor/en/docs-6.4.2/index.html">hipTensor</a></td>
<td>1.5.0</td>
<td><a href="https://github.com/ROCm/hipTensor"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocPRIM/en/docs-6.4.1/index.html">rocPRIM</a></td>
<td>3.4.0</td>
<td><a href="https://rocm.docs.amd.com/projects/rocPRIM/en/docs-6.4.2/index.html">rocPRIM</a></td>
<td>3.4.0&nbsp;&Rightarrow;&nbsp;<a href="#rocprim-3-4-1">3.4.1</td>
<td><a href="https://github.com/ROCm/rocPRIM"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocThrust/en/docs-6.4.1/index.html">rocThrust</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rocThrust/en/docs-6.4.2/index.html">rocThrust</a></td>
<td>3.3.0</td>
<td><a href="https://github.com/ROCm/rocThrust"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<th rowspan="7">Tools</th>
<th rowspan="7">System management</th>
<td><a href="https://rocm.docs.amd.com/projects/amdsmi/en/docs-6.4.1/index.html">AMD SMI</a></td>
<td>25.3.0&nbsp;&Rightarrow;&nbsp;<a href="#amd-smi-25-4-2">25.4.2</a></td>
<td><a href="https://rocm.docs.amd.com/projects/amdsmi/en/docs-6.4.2/index.html">AMD SMI</a></td>
<td>25.4.2&nbsp;&Rightarrow;&nbsp;<a href="#amd-smi-25-5-1">25.5.1</a></td>
<td><a href="https://github.com/ROCm/amdsmi"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rdc/en/docs-6.4.1/index.html">ROCm Data Center Tool</a></td>
<td>0.3.0&nbsp;&Rightarrow;&nbsp;<a href="#rocm-data-center-tool-0-3-0">0.3.0</td>
<td><a href="https://rocm.docs.amd.com/projects/rdc/en/docs-6.4.2/index.html">ROCm Data Center Tool</a></td>
<td>0.3.0</td>
<td><a href="https://github.com/ROCm/rdc"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocminfo/en/docs-6.4.1/index.html">rocminfo</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rocminfo/en/docs-6.4.2/index.html">rocminfo</a></td>
<td>1.0.0</td>
<td><a href="https://github.com/ROCm/rocminfo"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocm_smi_lib/en/docs-6.4.1/index.html">ROCm SMI</a></td>
<td>7.5.0&nbsp;&Rightarrow;&nbsp;<a href="#rocm-smi-7-5-0">7.5.0</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rocm_smi_lib/en/docs-6.4.2/index.html">ROCm SMI</a></td>
<td>7.5.0</td>
<td><a href="https://github.com/ROCm/rocm_smi_lib"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/docs-6.4.1/index.html">ROCmValidationSuite</a></td>
<td>1.1.0</td>
<td><a href="https://rocm.docs.amd.com/projects/ROCmValidationSuite/en/docs-6.4.2/index.html">ROCm Validation Suite</a></td>
<td>1.1.0&nbsp;&Rightarrow;&nbsp;<a href="#rocm-validation-suite-1-1-0">1.1.0</td>
<td><a href="https://github.com/ROCm/ROCmValidationSuite"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
<tr>
<th rowspan="6"></th>
<th rowspan="6">Performance</th>
<td><a href="https://rocm.docs.amd.com/projects/rocm_bandwidth_test/en/docs-6.4.1/index.html">ROCm Bandwidth
<td><a href="https://rocm.docs.amd.com/projects/rocm_bandwidth_test/en/docs-6.4.2/index.html">ROCm Bandwidth
Test</a></td>
<td>1.4.0</td>
<td><a href="https://github.com/ROCm/rocm_bandwidth_test/"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocprofiler-compute/en/docs-6.4.1/index.html">ROCm Compute Profiler</a></td>
<td>3.1.0</td>
<td><a href="https://rocm.docs.amd.com/projects/rocprofiler-compute/en/docs-6.4.2/index.html">ROCm Compute Profiler</a></td>
<td>3.1.0&nbsp;&Rightarrow;&nbsp;<a href="#rocm-compute-profiler-3-1-1">3.1.1</td>
<td><a href="https://github.com/ROCm/rocprofiler-compute"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocprofiler-systems/en/docs-6.4.1/index.html">ROCm Systems Profiler</a></td>
<td>1.0.0&nbsp;&Rightarrow;&nbsp;<a href="#rocm-systems-profiler-1-0-1">1.0.1</td>
<td><a href="https://rocm.docs.amd.com/projects/rocprofiler-systems/en/docs-6.4.2/index.html">ROCm Systems Profiler</a></td>
<td>1.0.1&nbsp;&Rightarrow;&nbsp;<a href="#rocm-systems-profiler-1-0-2">1.0.2</td>
<td><a href="https://github.com/ROCm/rocprofiler-systems"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocprofiler/en/docs-6.4.1/index.html">ROCProfiler</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rocprofiler/en/docs-6.4.2/index.html">ROCProfiler</a></td>
<td>2.0.0</td>
<td><a href="https://github.com/ROCm/ROCProfiler/"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/docs-6.4.1/index.html">ROCprofiler-SDK</a></td>
<td><a href="https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/docs-6.4.2/index.html">ROCprofiler-SDK</a></td>
<td>0.6.0</td>
<td><a href="https://github.com/ROCm/rocprofiler-sdk/"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/roctracer/en/docs-6.4.2/index.html">ROCTracer</a></td>
<td>4.1.0</td>
<td><a href="https://github.com/ROCm/ROCTracer/"><i
class="fab fa-github fa-lg"></i></a></td>
<tr>
<th rowspan="5"></th>
<th rowspan="5">Development</th>
<td><a href="https://rocm.docs.amd.com/projects/HIPIFY/en/docs-6.4.1/index.html">HIPIFY</a></td>
<td><a href="https://rocm.docs.amd.com/projects/HIPIFY/en/docs-6.4.2/index.html">HIPIFY</a></td>
<td>19.0.0</td>
<td><a href="https://github.com/ROCm/HIPIFY/"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCdbgapi/en/docs-6.4.1/index.html">ROCdbgapi</a></td>
<td><a href="https://rocm.docs.amd.com/projects/ROCdbgapi/en/docs-6.4.2/index.html">ROCdbgapi</a></td>
<td>0.77.2</td>
<td><a href="https://github.com/ROCm/ROCdbgapi/"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCmCMakeBuildTools/en/docs-6.4.1/index.html">ROCm CMake</a></td>
<td><a href="https://rocm.docs.amd.com/projects/ROCmCMakeBuildTools/en/docs-6.4.2/index.html">ROCm CMake</a></td>
<td>0.14.0</td>
<td><a href="https://github.com/ROCm/rocm-cmake/"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCgdb/en/docs-6.4.1/index.html">ROCm Debugger (ROCgdb)</a>
<td><a href="https://rocm.docs.amd.com/projects/ROCgdb/en/docs-6.4.2/index.html">ROCm Debugger (ROCgdb)</a>
</td>
<td>15.2</td>
<td><a href="https://github.com/ROCm/ROCgdb/"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/rocr_debug_agent/en/docs-6.4.1/index.html">ROCr Debug Agent</a>
<td><a href="https://rocm.docs.amd.com/projects/rocr_debug_agent/en/docs-6.4.2/index.html">ROCr Debug Agent</a>
</td>
<td>2.0.4</td>
<td><a href="https://github.com/ROCm/rocr_debug_agent/"><i
<tbody class="rocm-components-compilers tbody-reverse-zebra">
<tr>
<th rowspan="2" colspan="2">Compilers</th>
<td><a href="https://rocm.docs.amd.com/projects/HIPCC/en/docs-6.4.1/index.html">HIPCC</a></td>
<td><a href="https://rocm.docs.amd.com/projects/HIPCC/en/docs-6.4.2/index.html">HIPCC</a></td>
<td>1.1.1</td>
<td><a href="https://github.com/ROCm/llvm-project/tree/amd-staging/amd/hipcc"><i
class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/llvm-project/en/docs-6.4.1/index.html">llvm-project</a></td>
<td><a href="https://rocm.docs.amd.com/projects/llvm-project/en/docs-6.4.2/index.html">llvm-project</a></td>
<td>19.0.0</td>
<td><a href="https://github.com/ROCm/llvm-project/"><i
class="fab fa-github fa-lg"></i></a></td>
<tbody class="rocm-components-runtimes tbody-reverse-zebra">
<tr>
<th rowspan="2" colspan="2">Runtimes</th>
<td><a href="https://rocm.docs.amd.com/projects/HIP/en/docs-6.4.1/index.html">HIP</a></td>
<td>6.4.0&nbsp;&Rightarrow;&nbsp;<a href="#hip-6-4-1">6.4.1</td>
<td><a href="https://rocm.docs.amd.com/projects/HIP/en/docs-6.4.2/index.html">HIP</a></td>
<td>6.4.1&nbsp;&Rightarrow;&nbsp;<a href="#hip-6-4-2">6.4.2</td>
<td><a href="https://github.com/ROCm/HIP/"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
<tr>
<td><a href="https://rocm.docs.amd.com/projects/ROCR-Runtime/en/docs-6.4.1/index.html">ROCr Runtime</a></td>
<td>1.15.0&nbsp;&Rightarrow;&nbsp;<a href="#rocr-runtime-1-15-0">1.15.0</td>
<td><a href="https://rocm.docs.amd.com/projects/ROCR-Runtime/en/docs-6.4.2/index.html">ROCr Runtime</a></td>
<td>1.15.0</td>
<td><a href="https://github.com/ROCm/ROCR-Runtime/"><i class="fab fa-github fa-lg"></i></a></td>
</tr>
</tbody>
## Detailed component changes
The following sections describe key changes to ROCm components.
```{note}
For a historical overview of ROCm component updates, see the {doc}`ROCm consolidated changelog </release/changelog>`.
```
### **AMD SMI** (25.5.1)
#### Added
* Compute Unit occupancy information per process.
* Support for getting the GPU board voltage.
* New firmware PLDM_BUNDLE. `amd-smi firmware` can now show the PLDM bundle on supported systems.
* `amd-smi ras --afid --cper-file <file_path>` to decode CPER records.
#### Changed
* Padded `asic_serial` in `amdsmi_get_asic_info` with 0s.
* Renamed the `COMPUTE_PARTITION` field to `ACCELERATOR_PARTITION` in the CLI call `amd-smi --partition`.
#### Resolved issues
* Corrected the VRAM memory calculation in `amdsmi_get_gpu_process_list`. Previously, the VRAM memory usage reported by `amdsmi_get_gpu_process_list` was inaccurate because it was calculated using KB instead of KiB.
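For reference, the following is a minimal sketch of enumerating GPUs and querying their process lists through the AMD SMI C API. The count-then-fill calling pattern and the exact signatures shown are assumptions based on the API names above; consult the AMD SMI documentation for the authoritative interface.

```cpp
#include <amd_smi/amdsmi.h>  // header path assumed
#include <cstdio>
#include <vector>

int main() {
    if (amdsmi_init(AMDSMI_INIT_AMD_GPUS) != AMDSMI_STATUS_SUCCESS) return 1;

    // First call with a null list queries the count; second call fills it.
    uint32_t socket_count = 0;
    amdsmi_get_socket_handles(&socket_count, nullptr);
    std::vector<amdsmi_socket_handle> sockets(socket_count);
    amdsmi_get_socket_handles(&socket_count, sockets.data());

    for (auto socket : sockets) {
        uint32_t dev_count = 0;
        amdsmi_get_processor_handles(socket, &dev_count, nullptr);
        std::vector<amdsmi_processor_handle> devices(dev_count);
        amdsmi_get_processor_handles(socket, &dev_count, devices.data());

        for (auto device : devices) {
            uint32_t max_procs = 0;
            // Query the process count, then fetch the per-process info
            // (VRAM usage in this list is the value corrected in 25.5.1).
            amdsmi_get_gpu_process_list(device, &max_procs, nullptr);
            std::vector<amdsmi_proc_info_t> procs(max_procs);
            amdsmi_get_gpu_process_list(device, &max_procs, procs.data());
            printf("processes on device: %u\n", max_procs);
        }
    }
    amdsmi_shut_down();
    return 0;
}
```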
```{note}
See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
```
### **HIP** (6.4.2)
#### Added
* New log mask enumeration `LOG_COMGR` enables logging precise code object information.
#### Changed
* The HIP runtime now uses device bitcode before SPIR-V.
* The implementation that prevented `hipLaunchKernel` latency from degrading with the number of idle streams has been reverted and is disabled by default.
* Implemented the HIP API `hipEventRecordWithFlags`, which records an event in the specified stream with flags.
* Added support for the pointer attribute `HIP_POINTER_ATTRIBUTE_CONTEXT`.
* Added support for the flags `hipEventWaitDefault` and `hipEventWaitExternal`.
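To illustrate how the event-related changes fit together, the following is a minimal, hedged sketch that records an event with explicit flags in one stream and makes a second stream wait on it with `hipEventWaitDefault`. Passing `0` as the recording flag to request default behavior is an assumption; check the HIP headers for the supported flag constants.

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

#define HIP_CHECK(expr)                                            \
    do {                                                           \
        hipError_t status_ = (expr);                               \
        if (status_ != hipSuccess) {                               \
            printf("HIP error: %s\n", hipGetErrorString(status_)); \
            return 1;                                              \
        }                                                          \
    } while (0)

__global__ void add_one(float* x) { x[threadIdx.x] += 1.0f; }

int main() {
    float* d = nullptr;
    HIP_CHECK(hipMalloc(&d, 64 * sizeof(float)));
    HIP_CHECK(hipMemset(d, 0, 64 * sizeof(float)));

    hipStream_t producer, consumer;
    HIP_CHECK(hipStreamCreate(&producer));
    HIP_CHECK(hipStreamCreate(&consumer));
    hipEvent_t ev;
    HIP_CHECK(hipEventCreate(&ev));

    add_one<<<1, 64, 0, producer>>>(d);
    // Record the event in the producer stream; 0 requests the default
    // recording behavior (the specific flag constants are an assumption).
    HIP_CHECK(hipEventRecordWithFlags(ev, producer, 0));
    // Make the consumer stream wait on the event with the default wait flag.
    HIP_CHECK(hipStreamWaitEvent(consumer, ev, hipEventWaitDefault));
    add_one<<<1, 64, 0, consumer>>>(d);
    HIP_CHECK(hipStreamSynchronize(consumer));

    HIP_CHECK(hipEventDestroy(ev));
    HIP_CHECK(hipStreamDestroy(producer));
    HIP_CHECK(hipStreamDestroy(consumer));
    HIP_CHECK(hipFree(d));
    return 0;
}
```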
#### Optimized
* Improved kernel logging, including demangling shader names.
* Refined the implementation of the HIP APIs `hipEventRecord` and `hipStreamWaitEvent` for better performance.
* Improved the implementation of `hipEventSynchronize`: the HIP runtime now makes internal callbacks non-blocking operations to improve performance.
#### Resolved issues
* Stale state during graph capture. The returned error was fixed; the HIP runtime now always uses the latest dependent nodes during `hipEventRecord` capture.
* Segmentation fault during kernel execution. The HIP runtime now allows the maximum stack size permitted by the ISA on the GPU device.
* Dependency on `libgcc-s1` during `rocm-dev` installation on Debian Buster. The HIP runtime removed this Debian package dependency and uses `libgcc1` instead on this distribution.
* Build issue when dynamically loading `COMGR` on Fedora and other distributions. The HIP runtime no longer links against `libamd_comgr.so`.
* Failure in the API `hipStreamDestroy` when the stream type is `hipStreamLegacy`. The API now returns the error code `hipErrorInvalidResourceHandle` in this condition.
* Kernel launch errors such as `shared object initialization failed`, `invalid device function`, or `kernel execution failure`. The HIP runtime now loads `COMGR` properly, considering the file with its name and mapped image.
* Memory access fault in some applications. The HIP runtime fixed offset accumulation in the memory address.
* Memory leak in virtual memory management (VMM). The HIP runtime now uses the size of the handle for the allocated memory range instead of the actual size of physical memory, which fixed an address clash with VMM.
* Large memory allocation issue. The HIP runtime now checks GPU video RAM and system RAM properly and sets size limits during memory allocation on either the host or the GPU device.
* Support for the `hipDeviceMallocContiguous` flag in `hipExtMallocWithFlags()`. It now enables `HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG` in the memory pool allocation on the GPU device.
* Random memory segmentation fault when handling `GraphExec` object release and device synchronization. The HIP runtime now uses an internal device synchronization function in `__hipUnregisterFatBinary`.
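As a brief illustration of the contiguous-allocation fix, the following sketch requests physically contiguous device memory through `hipExtMallocWithFlags()`. The `(void**, size_t, unsigned int)` extension signature is an assumption based on the HIP extension headers.

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
    void* buf = nullptr;
    const size_t bytes = 64ull << 20;  // 64 MiB
    // Request physically contiguous device memory; per the note above,
    // this maps to HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG in the pool allocation.
    hipError_t err = hipExtMallocWithFlags(&buf, bytes, hipDeviceMallocContiguous);
    if (err != hipSuccess) {
        printf("allocation failed: %s\n", hipGetErrorString(err));
        return 1;
    }
    hipFree(buf);
    return 0;
}
```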
### **hipBLASLt** (0.12.1)
#### Added
* Support for gfx1151 on Linux, complementing the previous support in the HIP SDK for Windows.
### **RCCL** (2.22.3)
#### Added
* Support for the LL128 protocol on gfx942.
#### Resolved issues
* Fixed the accuracy issue for the MSCCLPP `allreduce7` kernel in graph mode.
#### Known issues
* Within the RCCL-UnitTests test suite, failures occur in tests ending with the `.ManagedMem` and `.ManagedMemGraph` suffixes. These failures only affect the test results and do not affect the RCCL component itself. This issue will be resolved in a future ROCm release.
### **rocBLAS** (4.4.1)
#### Resolved issues
* rocBLAS might have failed to produce correct results for cherk/zherk on gfx90a/gfx942 with problem sizes k > 500 because the imaginary portion on the C matrix diagonal was not zeroed. rocBLAS now zeros the imaginary portion.
### **ROCm Compute Profiler** (3.1.1)
#### Added
* 8-bit floating point (FP8) metrics support for AMD Instinct MI300 GPUs.
* Additional data types for roofline profiling: FP8, FP16, BF16, FP32, FP64, I8, I32, and I64 (dependent on the GPU architecture).
* Data type selection option `--roofline-data-type` / `-R` for roofline profiling. The default data type is FP32.
#### Changed
* Changed the dependency from `rocm-smi` to `amd-smi`.
#### Resolved issues
* Fixed a crash related to the Agent ID caused by the new format of the `rocprofv3` output CSV file.
### **ROCm Systems Profiler** (1.0.2)
#### Optimized
* Improved readability of the OpenMP target offload traces by showing them on a single Perfetto track.
#### Resolved issues
* Fixed the file path to the script that merges Perfetto files from multi-process MPI runs. The script has also been renamed from `merge-multiprocess-output.sh` to `rocprof-sys-merge-output.sh`.
### **ROCm Validation Suite** (1.1.0)
#### Added
* NPS2/DPX and NPS4/CPX partition modes support for AMD Instinct MI300X.
### **rocPRIM** (3.4.1)
#### Upcoming changes
* Changes to the template parameters of warp and block algorithms will be made in an upcoming release.
* Due to an upcoming compiler change, the following symbols related to warp size have been marked as deprecated and will be removed in an upcoming major release:
* `rocprim::device_warp_size()`. This has been replaced by `rocprim::arch::wavefront::min_size()` and `rocprim::arch::wavefront::max_size()` for compile-time constants. Use these when allocating global or shared memory. For run-time constants, use `rocprim::arch::wavefront::size()`.
* `rocprim::warp_size()`
* `ROCPRIM_WAVEFRONT_SIZE`
* The default scan accumulator types for device-level scan algorithms will be changed in an upcoming release, resulting in a breaking change. Previously, the default accumulator type was set to the input type for inclusive scans and to the initial value type for exclusive scans. This could lead to unexpected overflow if the input or initial type was smaller than the output type and the accumulator type wasn't explicitly set using the `AccType` template parameter. The new default accumulator types will be set to the type that results when the input or initial value type is applied to the scan operator. Migration sketches for the wavefront-size APIs and for pinning `AccType` explicitly follow this list.
The following is the complete list of affected functions and how their default accumulator types are changing:
* `rocprim::inclusive_scan`
* current default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>`
* future default: `class AccType = rocprim::invoke_result_binary_op_t<typename std::iterator_traits<InputIterator>::value_type, BinaryFunction>`
* `rocprim::deterministic_inclusive_scan`
* current default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>`
* future default: `class AccType = rocprim::invoke_result_binary_op_t<typename std::iterator_traits<InputIterator>::value_type, BinaryFunction>`
* `rocprim::exclusive_scan`
* current default: `class AccType = detail::input_type_t<InitValueType>>`
* future default: `class AccType = rocprim::invoke_result_binary_op_t<rocprim::detail::input_type_t<InitValueType>, BinaryFunction>`
* `rocprim::deterministic_exclusive_scan`
* current default: `class AccType = detail::input_type_t<InitValueType>>`
* future default: `class AccType = rocprim::invoke_result_binary_op_t<rocprim::detail::input_type_t<InitValueType>, BinaryFunction>`
* `rocprim::load_cs` and `rocprim::store_cs` are deprecated and will be removed in an upcoming release. Alternatively, you can use `rocprim::load_nontemporal` and `rocprim::store_nontemporal` to load and store values in specific conditions (like bypassing the cache) for `rocprim::thread_load` and `rocprim::thread_store`.
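To make these migrations concrete, two hedged sketches follow. The first replaces the deprecated wavefront-size queries with the `rocprim::arch::wavefront` APIs, assuming they are constexpr-evaluable for shared-memory sizing as the notes above describe:

```cpp
#include <hip/hip_runtime.h>
#include <rocprim/rocprim.hpp>

__global__ void scratch_kernel(float* out)
{
    // Compile-time upper bound: safe for sizing static shared memory
    // (replaces the deprecated ROCPRIM_WAVEFRONT_SIZE / device_warp_size()).
    constexpr unsigned int max_ws = rocprim::arch::wavefront::max_size();
    __shared__ float scratch[max_ws];

    // Run-time wavefront size of the executing device (not a constant).
    const unsigned int ws = rocprim::arch::wavefront::size();

    const unsigned int lane = threadIdx.x % ws;
    scratch[lane] = static_cast<float>(lane);
    __syncthreads();
    out[threadIdx.x] = scratch[lane];
}
```

The second pins the scan accumulator type explicitly through the `AccType` template parameter so that summing narrow inputs into a wider output cannot overflow. The template parameter order follows the defaults quoted above, and the count-then-run temporary-storage convention is the usual rocPRIM pattern; both are assumptions to verify against the rocPRIM headers.

```cpp
#include <hip/hip_runtime.h>
#include <rocprim/rocprim.hpp>
#include <cstdint>

// Sum many uint8_t values into uint32_t outputs. With the current default,
// the accumulator would be uint8_t and could silently overflow, so the
// accumulator type is pinned explicitly via the AccType template parameter.
hipError_t scan_bytes(const uint8_t* d_in, uint32_t* d_out, size_t n,
                      hipStream_t stream)
{
    using acc_t = uint32_t;
    rocprim::plus<acc_t> op;

    size_t tmp_bytes = 0;
    // First call (null temporary storage) queries the required storage size.
    hipError_t err = rocprim::inclusive_scan<rocprim::default_config,
                                             const uint8_t*, uint32_t*,
                                             rocprim::plus<acc_t>, acc_t>(
        nullptr, tmp_bytes, d_in, d_out, n, op, stream);
    if (err != hipSuccess) return err;

    void* d_tmp = nullptr;
    if ((err = hipMalloc(&d_tmp, tmp_bytes)) != hipSuccess) return err;
    err = rocprim::inclusive_scan<rocprim::default_config,
                                  const uint8_t*, uint32_t*,
                                  rocprim::plus<acc_t>, acc_t>(
        d_tmp, tmp_bytes, d_in, d_out, n, op, stream);
    hipFree(d_tmp);
    return err;
}
```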
### **rocSHMEM** (2.0.1)
#### Resolved issues
* Incorrect output for `rocshmem_ctx_my_pe` and `rocshmem_ctx_n_pes`.
* Multi-team errors by providing team specific buffers in `rocshmem_ctx_wg_team_sync`.
* Missing implementation of `rocshmem_g` for IPC conduit.
### **rocSOLVER** (3.28.2)
#### Added
* Hybrid computation support for existing routines, such as STERF.
* SVD for general matrices based on Cuppen's Divide and Conquer algorithm:
  * GESDD (with batched and strided_batched versions)
#### Optimized
* Reduced the device memory requirements for STEDC, SYEVD/HEEVD, and SYGVD/HEGVD.
* Improved the performance of STEDC and divide and conquer Eigensolvers.
* Improved the performance of SYTRD, the initial step of the Eigensolvers that start with the tridiagonalization of the input matrix.
## ROCm known issues
ROCm known issues are noted on {fab}`github` [GitHub](https://github.com/ROCm/ROCm/labels/Verified%20Issue). For known
issues related to individual components, review the [Detailed component changes](#detailed-component-changes).
### Radeon AI PRO R9700 hangs when running Stable Diffusion 2.1 at batch sizes above four
Radeon AI PRO R9700 GPUs might hang when running [Stable Diffusion 2.1](https://huggingface.co/stabilityai/stable-diffusion-2-1) with batch sizes greater than four. As a workaround, limit batch sizes to four or fewer. This issue will be addressed in a future ROCm release. See [issue #4770](https://github.com/ROCm/ROCm/issues/4770) on GitHub.
### RCCL MSCCL initialization failure
When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault. The recommended workaround is to disable MSCCL with `export RCCL_MSCCL_ENABLE=0`.
This issue will be fixed in a future ROCm release. See [issue #4769](https://github.com/ROCm/ROCm/issues/4769) on GitHub.
## ROCm resolved issues
The following are previously known issues resolved in this release. For resolved issues related to individual components, review the [Detailed component changes](#detailed-component-changes).
### AMD SMI CLI: CPER entries not dumped continuously when using follow flag
An issue where CPER entries were not streamed continuously as intended when using the `--follow` flag with `amd-smi ras --cper` has been resolved. See [GitHub issue #4768](https://github.com/ROCm/ROCm/issues/4768).
### Instinct MI300X reports incorrect raw GPU timestamps
An issue where the command processor firmware reported incorrect raw GPU timestamps on MI300X accelerators has been resolved. See [GitHub issue #4079](https://github.com/ROCm/ROCm/issues/4079).
### MIOpen generates incorrect results for particular input with FP32 data type
An issue where MIOpen generated incorrect results in the `conv2dbackward` function for a particular input with 32-bit floating point (FP32) data types has been resolved. The issue was specific to FP32 data types with a 2 x 2 kernel size and 2 x 1 dilation. See [GitHub issue #4606](https://github.com/ROCm/ROCm/issues/4606).
## ROCm upcoming changes
The following changes to the ROCm software stack are anticipated for future releases.
### AMD SMI migration to AMDGPU driver repository
In a future release, [AMD SMI](https://github.com/ROCm/amdsmi) will be relocated from the ROCm organization repository to a new AMDTools repository to better align with its system-level functionality. `amd-smi-lib` will no longer be included in the `rocm-developer-tools` meta-package that ships with your standard ROCm installation. Instead, it will be packaged with the AMDGPU driver installation.
### ROCm SMI deprecation
[ROCm SMI](https://github.com/ROCm/rocm_smi_lib) will be phased out in an upcoming ROCm release in favor of AMD SMI.
### HIP wavefront size API deprecation
The following wavefront size APIs are deprecated and will be disabled in a future release:
* The `__AMDGCN_WAVEFRONT_SIZE__` macro and `__AMDGCN_WAVEFRONT_SIZE` alias will be removed in an upcoming release.
It is recommended to remove any use of this macro. For more information, see
[AMDGPU support](https://rocm.docs.amd.com/projects/llvm-project/en/docs-6.4.2/LLVM/clang/html/AMDGPUSupport.html).
* `warpSize` will only be available as a non-`constexpr` variable. Where required,
the wavefront size should be queried via the `warpSize` variable in device code,
or via `hipGetDeviceProperties` in host code. Neither of these will result in a compile-time constant. For more information, see [warpSize](https://rocm.docs.amd.com/projects/HIP/en/docs-6.4.2/how-to/hip_cpp_language_extensions.html#warpsize).
* For cases where compile-time evaluation of the wavefront size cannot be avoided,
uses of `__AMDGCN_WAVEFRONT_SIZE`, `__AMDGCN_WAVEFRONT_SIZE__`, or `warpSize`
can be replaced with a user-defined macro or `constexpr` variable with the wavefront
size of the target architecture. A sketch of the run-time queries follows.
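The following minimal sketch shows the two supported run-time queries: reading `warpSize` in device code and `hipGetDeviceProperties` in host code. Both calls are standard HIP APIs; only the kernel and buffer names are illustrative.

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void report(int* ws_out)
{
    // Device code: warpSize is a non-constexpr variable, so it cannot size
    // static shared memory; read it at run time instead.
    if (threadIdx.x == 0) *ws_out = warpSize;
}

int main()
{
    // Host code: query the wavefront size through device properties.
    hipDeviceProp_t prop;
    hipGetDeviceProperties(&prop, 0);
    printf("host-queried wavefront size: %d\n", prop.warpSize);

    int* d_ws;
    hipMalloc(&d_ws, sizeof(int));
    report<<<1, 1>>>(d_ws);
    int ws = 0;
    hipMemcpy(&ws, d_ws, sizeof(int), hipMemcpyDeviceToHost);
    printf("device-reported wavefront size: %d\n", ws);
    hipFree(d_ws);
    return 0;
}
```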
There are a number of upcoming changes planned for the HIP runtime API in an upcoming major release
that are not backward compatible with prior releases. Most of these changes increase
alignment between HIP and CUDA APIs or behavior. Some of the upcoming changes are to
clean up header files, remove namespace collisions, and have a clear separation between
`hipRTC` and the HIP runtime. For more information, see [HIP 7.0 Is Coming: What You Need to Know to Stay Ahead](https://rocm.blogs.amd.com/ecosystems-and-partners/transition-to-hip-7.0-blog/README.html).
ROCm Version,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
:ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
,"RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
,SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
,"Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,,,,,,,,,,,,
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
,RDNA4,,,,,,,,,,,,,,,
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
,,,,,,,,,,,,,,,,
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
`UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
`UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
,,,,,,,,,,,,,,,,
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
Thrust,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
CUB,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
,,,,,,,,,,,,,,,,
KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
:doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
,,,,,,,,,,,,,,,,
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
:doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
:doc:`MIOpen <miopen:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`MIVisionX <mivisionx:index>`,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
:doc:`rocAL <rocal:index>`,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
:doc:`rocDecode <rocdecode:index>`,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
:doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`RPP <rpp:index>`,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
,,,,,,,,,,,,,,,,
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
:doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
:doc:`rocSHMEM <rocshmem:index>`,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
,,,,,,,,,,,,,,,,
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
:doc:`hipBLAS <hipblas:index>`,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
:doc:`hipBLASLt <hipblaslt:index>`,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
:doc:`hipFFT <hipfft:index>`,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
:doc:`hipfort <hipfort:index>`,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
:doc:`hipRAND <hiprand:index>`,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
:doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
:doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
:doc:`rocALUTION <rocalution:index>`,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
:doc:`rocBLAS <rocblas:index>`,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
:doc:`rocFFT <rocfft:index>`,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
:doc:`rocRAND <rocrand:index>`,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
:doc:`rocSOLVER <rocsolver:index>`,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
:doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
:doc:`rocWMMA <rocwmma:index>`,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
:doc:`Tensile <tensile:src/index>`,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
,,,,,,,,,,,,,,,,
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
:doc:`hipCUB <hipcub:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`hipTensor <hiptensor:index>`,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
:doc:`rocPRIM <rocprim:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
,,,,,,,,,,,,,,,,
SUPPORT LIBS,,,,,,,,,,,,,,,,
`hipother <https://github.com/ROCm/hipother>`_,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
`rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
,,,,,,,,,,,,,,,,
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
:doc:`AMD SMI <amdsmi:index>`,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
:doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
,,,,,,,,,,,,,,,,
PERFORMANCE TOOLS,,,,,,,,,,,,,,,,
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCProfiler <rocprofiler:index>`,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCTracer <roctracer:index>`,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
,,,,,,,,,,,,,,,,
DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,
:doc:`HIPIFY <hipify:index>`,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
,,,,,,,,,,,,,,,,
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
`Flang <https://github.com/ROCm/flang>`_,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
:doc:`llvm-project <llvm-project:index>`,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
,,,,,,,,,,,,,,,,
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
:doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
:doc:`HIP <hip:index>`,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
:doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
ROCm Version,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
:ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
,,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
,"RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
,"SLES 15 SP7, SP6",SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
,,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
,"Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,,,,,,,,,,,,
,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
,RDNA4,RDNA4,,,,,,,,,,,,,,,
,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS-past-60]_,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
,gfx1200 [#RDNA-OS-past-60]_,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
,gfx1101 [#RDNA-OS-past-60]_ [#7700XT-OS-past-60]_,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_,gfx942 [#mi300_612-past-60]_,gfx942 [#mi300_612-past-60]_,gfx942 [#mi300_611-past-60]_,gfx942 [#mi300_610-past-60]_,gfx942 [#mi300_602-past-60]_,gfx942 [#mi300_600-past-60]_
,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
,,,,,,,,,,,,,,,,,
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
,,,,,,,,,,,,,,,,,
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,
`UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
`UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
,,,,,,,,,,,,,,,,,
THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,
Thrust,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
CUB,2.5.0,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
,,,,,,,,,,,,,,,,,
KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,
:doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
,,,,,,,,,,,,,,,,,
ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,
:doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
:doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
:doc:`MIOpen <miopen:index>`,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`MIVisionX <mivisionx:index>`,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
:doc:`rocAL <rocal:index>`,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
:doc:`rocDecode <rocdecode:index>`,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
:doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`RPP <rpp:index>`,1.9.10,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
,,,,,,,,,,,,,,,,,
COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,
:doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
:doc:`rocSHMEM <rocshmem:index>`,2.0.1,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
,,,,,,,,,,,,,,,,,
MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
:doc:`hipBLAS <hipblas:index>`,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
:doc:`hipBLASLt <hipblaslt:index>`,0.12.1,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
:doc:`hipFFT <hipfft:index>`,1.0.18,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
:doc:`hipfort <hipfort:index>`,0.6.0,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
:doc:`hipRAND <hiprand:index>`,2.12.0,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
:doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
:doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
:doc:`rocALUTION <rocalution:index>`,3.2.3,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
:doc:`rocBLAS <rocblas:index>`,4.4.1,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
:doc:`rocFFT <rocfft:index>`,1.0.32,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
:doc:`rocRAND <rocrand:index>`,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
:doc:`rocSOLVER <rocsolver:index>`,3.28.2,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
:doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
:doc:`rocWMMA <rocwmma:index>`,1.7.0,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
:doc:`Tensile <tensile:src/index>`,4.43.0,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
,,,,,,,,,,,,,,,,,
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,
:doc:`hipCUB <hipcub:index>`,3.4.0,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`hipTensor <hiptensor:index>`,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
:doc:`rocPRIM <rocprim:index>`,3.4.1,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
:doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
,,,,,,,,,,,,,,,,,
SUPPORT LIBS,,,,,,,,,,,,,,,,,
`hipother <https://github.com/ROCm/hipother>`_,6.4.43483,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
`rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
,,,,,,,,,,,,,,,,,
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,
:doc:`AMD SMI <amdsmi:index>`,25.5.1,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
:doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.5.0,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
:doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
,,,,,,,,,,,,,,,,,
PERFORMANCE TOOLS,,,,,,,,,,,,,,,,,
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.1,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.2,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCProfiler <rocprofiler:index>`,2.0.60402,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`ROCTracer <roctracer:index>`,4.1.60402,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
,,,,,,,,,,,,,,,,,
DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,,
:doc:`HIPIFY <hipify:index>`,19.0.0,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
:doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
:doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
:doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
`rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
:doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
,,,,,,,,,,,,,,,,,
COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
`Flang <https://github.com/ROCm/flang>`_,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
:doc:`llvm-project <llvm-project:index>`,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25224,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
,,,,,,,,,,,,,,,,,
RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,,
:doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
:doc:`HIP <hip:index>`,6.4.43484,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
:doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
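The matrix above is maintained as a flattened CSV: one stub column naming the component, followed by one column per ROCm release, newest first, with all-caps banner rows and RST anchor rows separating the sections. That layout lends itself to programmatic lookup. The following is a minimal sketch, not part of the ROCm docs build, assuming the table body has been saved as ``past-60-matrix.csv`` with the ``ROCm Version`` row first; the file name and function names are illustrative.

.. code-block:: python

   import csv
   import re


   def is_separator(row):
       """Banner and anchor rows carry no version data: every non-stub
       cell is empty or an RST anchor such as '.. _mathlibs-...:'."""
       return all(c.strip() == "" or c.strip().startswith(".. _") for c in row[1:])


   def load_matrix(path):
       """Parse the flattened matrix into {component: {release: version}}."""
       with open(path, newline="") as f:
           rows = [r for r in csv.reader(f) if r]
       releases = [c.strip() for c in rows[0][1:]]  # newest release first
       matrix = {}
       for row in rows[1:]:
           stub = row[0].strip()
           if not stub or is_separator(row):  # continuation, banner, or anchor row
               continue
           # Reduce RST markup such as :doc:`MIOpen <miopen:index>` or
           # `half <https://github.com/ROCm/half>`_ to the bare component name,
           # and drop footnote markers like [#verl_compat]_.
           name = re.sub(r"^:\w+:", "", stub)
           name = re.sub(r"`([^`<]+?)\s*(?:<[^>]*>)?`_*", r"\1", name)
           name = re.sub(r"\s*\[#[^\]]*\]_", "", name).strip()
           matrix[name] = dict(zip(releases, (c.strip() for c in row[1:])))
       return matrix


   # Example: which MIOpen ships with ROCm 6.4.2?
   # print(load_matrix("past-60-matrix.csv")["MIOpen"]["6.4.2"])  # -> 3.4.0

Note that the separator test keys off the data cells rather than the stub text, so all-caps component rows such as ``CUB`` are still picked up as components rather than discarded as section banners.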
@@ -23,14 +23,14 @@ compatibility and system requirements.
.. container:: format-big-table
.. csv-table::
:header: "ROCm Version", "6.4.1", "6.4.0", "6.3.0"
:header: "ROCm Version", "6.4.2", "6.4.1", "6.3.0"
:stub-columns: 1
:ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2
,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5
,"RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4"
,"RHEL 9.6, 9.4","RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4"
,RHEL 8.10,RHEL 8.10,RHEL 8.10
,SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5"
,"SLES 15 SP7, SP6",SLES 15 SP6,"SLES 15 SP6, SP5"
,"Oracle Linux 9, 8 [#mi300x]_","Oracle Linux 9, 8 [#mi300x]_",Oracle Linux 8.10 [#mi300x]_
,Debian 12 [#single-node]_,Debian 12 [#single-node]_,
,Azure Linux 3.0 [#mi300x]_,Azure Linux 3.0 [#mi300x]_,
@@ -38,13 +38,13 @@ compatibility and system requirements.
:doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3
,CDNA2,CDNA2,CDNA2
,CDNA,CDNA,CDNA
,RDNA4,,
,RDNA4,RDNA4,
,RDNA3,RDNA3,RDNA3
,RDNA2,RDNA2,RDNA2
,.. _gpu-support-compatibility-matrix:,,
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS]_,,
,gfx1200 [#RDNA-OS]_,,
,gfx1101 [#RDNA-OS]_,,
:doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS]_,gfx1201 [#RDNA-OS]_,
,gfx1200 [#RDNA-OS]_,gfx1200 [#RDNA-OS]_,
,gfx1101 [#RDNA-OS]_ [#7700XT-OS]_,gfx1101 [#RDNA-OS]_,
,gfx1100,gfx1100,gfx1100
,gfx1030,gfx1030,gfx1030
,gfx942,gfx942,gfx942
@@ -54,7 +54,9 @@ compatibility and system requirements.
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 2.1, 2.0, 1.13"
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1"
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.31
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,85f95ae
:doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`,N/A,N/A,0.7.0
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.17.3
,,,
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
@@ -81,23 +83,23 @@ compatibility and system requirements.
,,,
COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
:doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.21.5
:doc:`rocSHMEM <rocshmem:index>`,2.0.0,2.0.0,N/A
:doc:`rocSHMEM <rocshmem:index>`,2.0.1,2.0.0,N/A
,,,
MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
`half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
:doc:`hipBLAS <hipblas:index>`,2.4.0,2.4.0,2.3.0
:doc:`hipBLASLt <hipblaslt:index>`,0.12.1,0.12.0,0.10.0
:doc:`hipBLASLt <hipblaslt:index>`,0.12.1,0.12.1,0.10.0
:doc:`hipFFT <hipfft:index>`,1.0.18,1.0.18,1.0.17
:doc:`hipfort <hipfort:index>`,0.6.0,0.6.0,0.5.0
:doc:`hipRAND <hiprand:index>`,2.12.0,2.12.0,2.11.0
:doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.4.0,2.3.0
:doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.2.0,3.1.2
:doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.3,0.2.2
:doc:`rocALUTION <rocalution:index>`,3.2.3,3.2.2,3.2.1
:doc:`rocBLAS <rocblas:index>`,4.4.0,4.4.0,4.3.0
:doc:`rocALUTION <rocalution:index>`,3.2.3,3.2.3,3.2.1
:doc:`rocBLAS <rocblas:index>`,4.4.1,4.4.0,4.3.0
:doc:`rocFFT <rocfft:index>`,1.0.32,1.0.32,1.0.31
:doc:`rocRAND <rocrand:index>`,3.3.0,3.3.0,3.2.0
:doc:`rocSOLVER <rocsolver:index>`,3.28.0,3.28.0,3.27.0
:doc:`rocSOLVER <rocsolver:index>`,3.28.2,3.28.0,3.27.0
:doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.4.0,3.3.0
:doc:`rocWMMA <rocwmma:index>`,1.7.0,1.7.0,1.6.0
:doc:`Tensile <tensile:src/index>`,4.43.0,4.43.0,4.42.0
@@ -105,16 +107,16 @@ compatibility and system requirements.
PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
:doc:`hipCUB <hipcub:index>`,3.4.0,3.4.0,3.3.0
:doc:`hipTensor <hiptensor:index>`,1.5.0,1.5.0,1.4.0
:doc:`rocPRIM <rocprim:index>`,3.4.0,3.4.0,3.3.0
:doc:`rocPRIM <rocprim:index>`,3.4.1,3.4.0,3.3.0
:doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0
,,,
SUPPORT LIBS,,,
`hipother <https://github.com/ROCm/hipother>`_,6.4.43483,6.4.43482,6.3.42131
`rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.1,6.4.0,6.3.0
`hipother <https://github.com/ROCm/hipother>`_,6.4.43483,6.4.43483,6.3.42131
`rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.2,6.4.1,6.3.0
`ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
,,,
SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
:doc:`AMD SMI <amdsmi:index>`,25.4.2,25.3.0,24.7.1
:doc:`AMD SMI <amdsmi:index>`,25.5.1,25.4.2,24.7.1
:doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0
:doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
:doc:`ROCm SMI <rocm_smi_lib:index>`,7.5.0,7.5.0,7.4.0
@@ -122,11 +124,11 @@ compatibility and system requirements.
,,,
PERFORMANCE TOOLS,,,
:doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.0,3.1.0,3.0.0
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.1,1.0.0,0.1.0
:doc:`ROCProfiler <rocprofiler:index>`,2.0.60401,2.0.60400,2.0.60300
:doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.1,3.1.0,3.0.0
:doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.2,1.0.1,0.1.0
:doc:`ROCProfiler <rocprofiler:index>`,2.0.60402,2.0.60401,2.0.60300
:doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.6.0,0.5.0
:doc:`ROCTracer <roctracer:index>`,4.1.60401,4.1.60400,4.1.60300
:doc:`ROCTracer <roctracer:index>`,4.1.60402,4.1.60401,4.1.60300
,,,
DEVELOPMENT TOOLS,,,
:doc:`HIPIFY <hipify:index>`,19.0.0,19.0.0,18.0.0.24455
@@ -139,25 +141,25 @@ compatibility and system requirements.
COMPILERS,.. _compilers-support-compatibility-matrix:,,
`clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A
:doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
`Flang <https://github.com/ROCm/flang>`_,19.0.0.25184,19.0.0.25133,18.0.0.24455
:doc:`llvm-project <llvm-project:index>`,19.0.0.25184,19.0.0.25133,18.0.0.24491
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25184,19.0.0.25133,18.0.0.24491
`Flang <https://github.com/ROCm/flang>`_,19.0.0.25224,19.0.0.25184,18.0.0.24455
:doc:`llvm-project <llvm-project:index>`,19.0.0.25224,19.0.0.25184,18.0.0.24491
`OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25224,19.0.0.25184,18.0.0.24491
,,,
RUNTIMES,.. _runtime-support-compatibility-matrix:,,
:doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43483,6.4.43482,6.3.42131
:doc:`HIP <hip:index>`,6.4.43483,6.4.43482,6.3.42131
:doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43484,6.4.43483,6.3.42131
:doc:`HIP <hip:index>`,6.4.43484,6.4.43483,6.3.42131
`OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
:doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.15.0,1.14.0
.. rubric:: Footnotes
.. [#mi300x] Oracle Linux and Azure Linux are supported only on AMD Instinct MI300X.
.. [#single-node] Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
.. [#mi300_620] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
.. [#kfd_support] Starting from ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart (assuming hardware support is available in both). For earlier ROCm releases, the compatibility is provided for +/- 2 releases. These are the compatibility combinations that are currently supported.
.. [#single-node] Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
.. [#RDNA-OS] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
.. [#7700XT-OS] Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
.. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
.. [#RDNA-OS] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.
.. _OS-kernel-versions:
@@ -176,14 +178,15 @@ Use this lookup table to confirm which operating system and kernel versions are
`Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 GA, 6.8 HWE", 2.35
,,
`Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.6, 5.14+, 2.34
, 9.5, 5.14+, 2.34
,9.5, 5.14+, 2.34
,9.4, 5.14+, 2.34
,9.3, 5.14+, 2.34
,,
`Red Hat Enterprise Linux (RHEL 8) <https://access.redhat.com/articles/3078#RHEL8>`_, 8.10, 4.18.0+, 2.28
,8.9, 4.18.0, 2.28
,,
`SUSE Linux Enterprise Server (SLES) <https://www.suse.com/support/kb/doc/?id=000019587#SLE15SP4>`_, 15 SP6, "6.5.0+, 6.4.0", 2.38
`SUSE Linux Enterprise Server (SLES) <https://www.suse.com/support/kb/doc/?id=000019587#SLE15SP4>`_, 15 SP7, 6.11.0+, 2.38
,15 SP6, "6.5.0+, 6.4.0", 2.38
,15 SP5, 5.14.21, 2.31
,,
`Oracle Linux <https://blogs.oracle.com/scoter/post/oracle-linux-and-unbreakable-enterprise-kernel-uek-releases>`_, 9, 5.15.0 (UEK), 2.35
@@ -225,7 +228,9 @@ Expand for full historical view of:
.. rubric:: Footnotes
.. [#mi300x-past-60] Oracle Linux and Azure Linux are supported only on AMD Instinct MI300X.
.. [#single-node-past-60] Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
.. [#RDNA-OS-past-60] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, and RHEL 9.4.
.. [#7700XT-OS-past-60] Radeon RX 7700 XT (gfx1101) is supported only on Ubuntu 24.04.2 and RHEL 9.6.
.. [#mi300_624-past-60] **For ROCm 6.2.4** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
.. [#mi300_622-past-60] **For ROCm 6.2.2** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
.. [#mi300_621-past-60] **For ROCm 6.2.1** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
@@ -235,6 +240,10 @@ Expand for full historical view of:
.. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
.. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
.. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
.. [#kfd_support-past-60] Starting from ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart (assuming hardware support is available in both). For earlier ROCm releases, the compatibility is provided for +/- 2 releases. These are the compatibility combinations that are currently supported.
.. [#verl_compat] verl is only supported on ROCm 6.2.0.
.. [#stanford-megatron-lm_compat] Stanford Megatron-LM is only supported on ROCm 6.3.0.
.. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
.. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
.. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
.. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
.. [#RDNA-OS-past-60] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.

View File

@@ -0,0 +1,255 @@
:orphan:
.. meta::
:description: Deep Graph Library (DGL) compatibility
:keywords: GPU, DGL compatibility
.. version-set:: rocm_version latest
********************************************************************************
DGL compatibility
********************************************************************************
Deep Graph Library `(DGL) <https://www.dgl.ai/>`_ is an easy-to-use, high-performance, and scalable
Python package for deep learning on graphs. DGL is framework agnostic: when
a deep graph model is one component of an end-to-end application, the rest of
the logic can be implemented in PyTorch.
* ROCm support for DGL is hosted in the `https://github.com/ROCm/dgl <https://github.com/ROCm/dgl>`_ repository.
* Due to independent compatibility considerations, this location differs from the `https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`_ upstream repository.
* Use the prebuilt :ref:`Docker images <dgl-docker-compat>` with DGL, PyTorch, and ROCm preinstalled.
* See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>`
to install and get started.
Supported devices
================================================================================
- **Officially Supported**: TF32 with AMD Instinct MI300X (through hipBLASLt)
- **Partially Supported**: TF32 with AMD Instinct MI250X
.. _dgl-recommendations:
Use cases and recommendations
================================================================================
DGL can be used for graph learning and for building popular graph models like
GAT, GCN, and GraphSAGE. These models support a variety of use cases (see the sketch after this list), such as:
- Recommender systems
- Network optimization and analysis
- 1D (temporal) and 2D (image) classification
- Drug discovery
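As a taste of the API, the following is a minimal two-layer GCN in DGL. This is a sketch assuming the PyTorch backend; the toy graph and feature sizes are illustrative only.

.. code-block:: python

   import torch
   import dgl
   from dgl.nn import GraphConv

   # Toy 4-node ring graph; self-loops let each node see its own features.
   g = dgl.graph(([0, 1, 2, 3], [1, 2, 3, 0]), num_nodes=4)
   g = dgl.add_self_loop(g)
   feats = torch.randn(4, 16)

   # ROCm builds of PyTorch expose AMD GPUs through the CUDA device API.
   device = "cuda" if torch.cuda.is_available() else "cpu"
   g, feats = g.to(device), feats.to(device)

   conv1 = GraphConv(16, 8).to(device)
   conv2 = GraphConv(8, 2).to(device)
   h = torch.relu(conv1(g, feats))
   logits = conv2(g, h)  # per-node class scores, shape (4, 2)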
Multiple DGL use cases have been tested and verified.
A recommended example follows a drug discovery pipeline using the ``SE3Transformer``.
Refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_,
where you can search for DGL examples and best practices for optimizing your training workflows on AMD GPUs.
Coverage includes:
- Single-GPU training/inference
- Multi-GPU training
.. _dgl-docker-compat:
Docker image compatibility
================================================================================
.. |docker-icon| raw:: html
<i class="fab fa-docker"></i>
AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl>`_
with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_.
Click the |docker-icon| to view the image on Docker Hub.
.. list-table:: DGL Docker image components
:header-rows: 1
:class: docker-image-compatibility
* - Docker
- DGL
- PyTorch
- Ubuntu
- Python
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
- 24.04
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
- 24.04
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
- 22.04
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
- 22.04
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
Key ROCm libraries for DGL
================================================================================
DGL on ROCm depends on specific libraries that affect its features and performance.
Using the DGL Docker container, or building DGL from the provided Dockerfile or a ROCm base image, is recommended.
If you prefer to build it yourself, ensure the following dependencies are installed (a verification sketch follows the table):
.. list-table::
:header-rows: 1
* - ROCm library
- Version
- Purpose
* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
- :version-ref:`"Composable Kernel" rocm_version`
- Enables faster execution of core operations like matrix multiplication
(GEMM), convolutions and transformations.
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
- :version-ref:`hipBLAS rocm_version`
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
matrix and vector operations.
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
- :version-ref:`hipBLASLt rocm_version`
- hipBLASLt is an extension of the hipBLAS library, providing additional
features like epilogues fused into the matrix multiplication kernel or
use of integer tensor cores.
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
- :version-ref:`hipCUB rocm_version`
- Provides a C++ template library for parallel algorithms for reduction,
scan, sort and select.
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
- :version-ref:`hipFFT rocm_version`
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
- :version-ref:`hipRAND rocm_version`
- Provides fast random number generation for GPUs.
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
- :version-ref:`hipSOLVER rocm_version`
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
singular value decompositions (SVD).
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
- :version-ref:`hipSPARSE rocm_version`
- Accelerates operations on sparse matrices, such as sparse matrix-vector
or matrix-matrix products.
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
- :version-ref:`hipSPARSELt rocm_version`
- Accelerates operations on sparse matrices, such as sparse matrix-vector
or matrix-matrix products.
* - `hipTensor <https://github.com/ROCm/hipTensor>`_
- :version-ref:`hipTensor rocm_version`
- Optimizes for high-performance tensor operations, such as contractions.
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
- :version-ref:`MIOpen rocm_version`
- Optimizes deep learning primitives such as convolutions, pooling,
normalization, and activation functions.
* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
- :version-ref:`MIGraphX rocm_version`
- Adds graph-level optimizations, ONNX model and mixed-precision support,
and enables Ahead-of-Time (AOT) compilation.
* - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
- :version-ref:`MIVisionX rocm_version`
- Optimizes acceleration for computer vision and AI workloads like
preprocessing, augmentation, and inferencing.
* - `rocAL <https://github.com/ROCm/rocAL>`_
- :version-ref:`rocAL rocm_version`
- Accelerates the data pipeline by offloading intensive preprocessing and
augmentation tasks. rocAL is part of MIVisionX.
* - `RCCL <https://github.com/ROCm/rccl>`_
- :version-ref:`RCCL rocm_version`
- Optimizes for multi-GPU communication for operations like AllReduce and
Broadcast.
* - `rocDecode <https://github.com/ROCm/rocDecode>`_
- :version-ref:`rocDecode rocm_version`
- Provides hardware-accelerated data decoding capabilities, particularly
for image, video, and other dataset formats.
* - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
- :version-ref:`rocJPEG rocm_version`
- Provides hardware-accelerated JPEG image decoding and encoding.
* - `RPP <https://github.com/ROCm/RPP>`_
- :version-ref:`RPP rocm_version`
- Speeds up data augmentation, transformation, and other preprocessing steps.
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
- :version-ref:`rocThrust rocm_version`
- Provides a C++ template library for parallel algorithms like sorting,
reduction, and scanning.
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
- :version-ref:`rocWMMA rocm_version`
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
multiplication (GEMM) and accumulation operations with mixed precision
support.
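If you build DGL yourself, a quick runtime check that the ROCm-enabled PyTorch build and the GPU are visible can save debugging time. This is a minimal sketch; ``torch.version.hip`` is populated only on ROCm builds of PyTorch.

.. code-block:: python

   import torch
   import dgl

   print("DGL:", dgl.__version__)
   print("PyTorch:", torch.__version__)
   print("HIP runtime:", torch.version.hip)        # None on CUDA/CPU builds
   print("GPU visible:", torch.cuda.is_available())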
Supported features
================================================================================
Many functions and methods available in upstream DGL are also supported in DGL on ROCm.
Instead of listing them all, support is grouped into the following categories to provide a general overview.
* DGL Base
* DGL Backend
* DGL Data
* DGL Dataloading
* DGL DGLGraph
* DGL Function
* DGL Ops
* DGL Sampling
* DGL Transforms
* DGL Utils
* DGL Distributed
* DGL Geometry
* DGL Mpops
* DGL NN
* DGL Optim
* DGL Sparse
Unsupported features
================================================================================
* Graphbolt
* Partial TF32 support (MI250X only)
* Kineto/ROCTracer integration
Unsupported functions
================================================================================
* ``more_nnz``
* ``format``
* ``multiprocess_sparse_adam_state_dict``
* ``record_stream_ndarray``
* ``half_spmm``
* ``segment_mm``
* ``gather_mm_idx_b``
* ``pgexplainer``
* ``sample_labors_prob``
* ``sample_labors_noprob``

View File

@@ -53,7 +53,7 @@ Use cases and recommendations
* The `nanoGPT in JAX <https://rocm.blogs.amd.com/artificial-intelligence/nanoGPT-JAX/README.html>`_
blog explores the implementation and training of a Generative Pre-trained
Transformer (GPT) model in JAX, inspired by Andrej Karpathys JAX-based
nanoGPT. Comparing how essential GPT components—such as self-attention
nanoGPT. Comparing how essential GPT components—such as self-attention
mechanisms and optimizers—are realized in JAX and JAX, also highlights
JAXs unique features.
@@ -97,7 +97,7 @@ Docker image compatibility
AMD validates and publishes ready-made `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
with ROCm backends on Docker Hub. The following Docker image tags and
associated inventories represent the latest JAX version from the official Docker Hub and are validated for
`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
`ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`_. Click the |docker-icon|
icon to view the image on Docker Hub.
.. list-table:: JAX Docker image components
@@ -110,7 +110,7 @@ icon to view the image on Docker Hub.
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.1-jax0.4.35-py3.12/images/sha256-7a0745a2a2758bdf86397750bac00e9086cbf67d170cfdbb08af73f7c7d18a6a"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
<a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.2-jax0.4.35-py3.12/images/sha256-8918fa806a172c1a10eb2f57131eb31b5d7c8fa1656b8729fe7d3d736112de83"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
- `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
- Ubuntu 24.04
@@ -118,7 +118,7 @@ icon to view the image on Docker Hub.
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.1-jax0.4.35-py3.10/images/sha256-5f9e8d6e6e69fdc9a1a3f2ba3b1234c3f46c53b7468538c07fd18b00899da54f"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
<a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.2-jax0.4.35-py3.10/images/sha256-a394be13c67b7fc602216abee51233afd4b6cb7adaa57ca97e688fba82f9ad79"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
- `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
- Ubuntu 22.04
@@ -160,12 +160,14 @@ associated inventories are tested for `ROCm 6.3.2 <https://repo.radeon.com/rocm/
- Ubuntu 22.04
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
.. _key_rocm_libraries:
Key ROCm libraries for JAX
================================================================================
JAX functionality on ROCm is determined by its underlying library
dependencies. These ROCm components affect the capabilities, performance, and
feature set available to developers.
The following ROCm libraries represent potential targets that could be utilized
by JAX on ROCm for various computational tasks. The actual libraries used will
depend on the specific implementation and operations performed.
.. list-table::
:header-rows: 1
@@ -173,347 +175,140 @@ feature set available to developers.
* - ROCm library
- Version
- Purpose
- Used in
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
- :version-ref:`hipBLAS rocm_version`
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
matrix and vector operations.
- Matrix multiplication in ``jax.numpy.matmul``, ``jax.lax.dot``, and
``jax.lax.dot_general``; operations like ``jax.numpy.dot``, which
involve vector and matrix computations; and batch matrix
multiplications such as ``jax.numpy.einsum`` with
matrix-multiplication patterns.
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
- :version-ref:`hipBLASLt rocm_version`
- hipBLASLt is an extension of hipBLAS, providing additional
features like epilogues fused into the matrix multiplication kernel or
use of integer tensor cores.
- Matrix multiplication in ``jax.numpy.matmul`` or ``jax.lax.dot``; the
XLA (Accelerated Linear Algebra) compiler uses hipBLASLt for optimized
matrix operations, mixed-precision support, and hardware-specific
optimizations.
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
- :version-ref:`hipCUB rocm_version`
- Provides a C++ template library for parallel algorithms for reduction,
scan, sort and select.
- Reduction functions (``jax.numpy.sum``, ``jax.numpy.mean``,
``jax.numpy.prod``, ``jax.numpy.max`` and ``jax.numpy.min``), prefix sum
(``jax.numpy.cumsum``, ``jax.numpy.cumprod``) and sorting
(``jax.numpy.sort``, ``jax.numpy.argsort``).
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
- :version-ref:`hipFFT rocm_version`
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
- Used in functions like ``jax.numpy.fft``.
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
- :version-ref:`hipRAND rocm_version`
- Provides fast random number generation for GPUs.
- Functions such as ``jax.random.uniform``, ``jax.random.normal``,
``jax.random.randint``, and ``jax.random.split``.
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
- :version-ref:`hipSOLVER rocm_version`
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
singular value decompositions (SVD).
- Solving linear systems (``jax.numpy.linalg.solve``), matrix
factorizations, SVD (``jax.numpy.linalg.svd``) and eigenvalue problems
(``jax.numpy.linalg.eig``).
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
- :version-ref:`hipSPARSE rocm_version`
- Accelerates operations on sparse matrices, such as sparse matrix-vector
or matrix-matrix products.
- Sparse matrix multiplication (``jax.numpy.matmul``), sparse
matrix-vector and matrix-matrix products
(``jax.experimental.sparse.dot``), sparse linear system solvers and
sparse data handling.
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
- :version-ref:`hipSPARSELt rocm_version`
- Accelerates operations on sparse matrices, such as sparse matrix-vector
or matrix-matrix products.
- Sparse matrix multiplication (``jax.numpy.matmul``), sparse
matrix-vector and matrix-matrix products
(``jax.experimental.sparse.dot``) and sparse linear system solvers.
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
- :version-ref:`MIOpen rocm_version`
- Optimized for deep learning primitives such as convolutions, pooling,
normalization, and activation functions.
- Speeds up convolutional neural networks (CNNs), recurrent neural
networks (RNNs), and other layers. Used in operations like
``jax.nn.conv``, ``jax.nn.relu``, and ``jax.nn.batch_norm``.
* - `RCCL <https://github.com/ROCm/rccl>`_
- :version-ref:`RCCL rocm_version`
- Optimized for multi-GPU communication for operations like all-reduce,
broadcast, and scatter.
- Distributes computations across multiple GPUs with ``pmap`` and
``jax.distributed``. XLA automatically uses RCCL when executing
operations across multiple GPUs on AMD hardware.
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
- :version-ref:`rocThrust rocm_version`
- Provides a C++ template library for parallel algorithms like sorting,
reduction, and scanning.
- Reduction operations like ``jax.numpy.sum``, ``jax.pmap``-based
distributed training (which involves parallel reductions), and
operations like ``jax.numpy.cumsum`` can use rocThrust.
Supported features
.. note::
This table shows ROCm libraries that could potentially be utilized by JAX. Not
all libraries may be used in every configuration, and the actual library usage
will depend on the specific operations and implementation details.
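For example, a single short program can exercise several of these code paths at once. This is a sketch; which backend library XLA dispatches to for a given operation is an implementation detail.

.. code-block:: python

   import jax
   import jax.numpy as jnp

   print(jax.devices())                 # expect GPU entries on a ROCm install

   key = jax.random.PRNGKey(0)          # RNG (hipRAND-backed paths)
   x = jax.random.normal(key, (1024, 1024))

   y = jnp.matmul(x, x)                 # GEMM path (hipBLAS/hipBLASLt)
   s = jnp.sum(y)                       # reduction (hipCUB/rocThrust)
   f = jnp.fft.fft(x[0])                # FFT path (hipFFT)
   print(s, f.shape)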
Supported data types and modules
===============================================================================
The following table maps the public JAX API modules to their supported
ROCm and JAX versions.
The following tables list the supported public JAX API data types and modules.
Supported data types
--------------------------------------------------------------------------------
ROCm supports all the JAX data types of the `jax.dtypes <https://docs.jax.dev/en/latest/jax.dtypes.html>`_
module, `jax.numpy.dtype <https://docs.jax.dev/en/latest/_autosummary/jax.numpy.dtype.html>`_,
and `default_dtype <https://docs.jax.dev/en/latest/default_dtypes.html>`_.
The ROCm-supported data types in JAX are collected in the following table.
.. list-table::
:header-rows: 1
* - Module
- Description
- As of JAX
- As of ROCm
* - ``jax.numpy``
- Implements the NumPy API, using the primitives in ``jax.lax``.
- 0.1.56
- 5.0.0
* - ``jax.scipy``
- Provides GPU-accelerated and differentiable implementations of many
functions from the SciPy library, leveraging JAX's transformations
(e.g., ``grad``, ``jit``, ``vmap``).
- 0.1.56
- 5.0.0
* - ``jax.lax``
- A library of primitive operations that underpins libraries such as
``jax.numpy``. Transformation rules, such as Jacobian-vector product
(JVP) and batching rules, are typically defined as transformations on
``jax.lax`` primitives.
- 0.1.57
- 5.0.0
* - ``jax.random``
- Provides a number of routines for deterministic generation of sequences
of pseudorandom numbers.
- 0.1.58
- 5.0.0
* - ``jax.sharding``
- Allows defining how arrays are partitioned and distributed across
multiple devices.
- 0.3.20
- 5.1.0
* - ``jax.distributed``
- Enables the scaling of computations across multiple devices on a single
machine or across multiple machines.
- 0.1.74
- 5.0.0
* - ``jax.image``
- Contains image manipulation functions like resize, scale and translation.
- 0.1.57
- 5.0.0
* - ``jax.nn``
- Contains common functions for neural network libraries.
- 0.1.56
- 5.0.0
* - ``jax.ops``
- Computes the minimum, maximum, sum or product within segments of an
array.
- 0.1.57
- 5.0.0
* - ``jax.stages``
- Contains interfaces to stages of the compiled execution process.
- 0.3.4
- 5.0.0
* - ``jax.extend``
- Provides access to JAX internal machinery. The
``jax.extend`` module defines a library view of some of JAX's internal
components.
- 0.4.15
- 5.5.0
* - ``jax.example_libraries``
- Serves as a collection of example code and libraries that demonstrate
various capabilities of JAX.
- 0.1.74
- 5.0.0
* - ``jax.experimental``
- Namespace for experimental features and APIs that are in development or
are not yet fully stable for production use.
- 0.1.56
- 5.0.0
* - ``jax.lib``
- Set of internal tools and types for bridging between JAX's Python
frontend and its XLA backend.
- 0.4.6
- 5.3.0
* - ``jax_triton``
- Library that integrates the Triton deep learning compiler with JAX.
- jax_triton 0.2.0
- 6.2.4
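As a quick illustration of the multi-device modules listed above, the following is a minimal data-parallel sketch with ``jax.pmap``; on ROCm, RCCL handles any cross-device collectives such as ``jax.lax.psum``.

.. code-block:: python

   import jax
   import jax.numpy as jnp

   n = jax.local_device_count()             # number of visible AMD GPUs
   xs = jnp.arange(n * 4.0).reshape(n, 4)   # one shard per device

   # pmap replicates the function across devices along the leading axis.
   ys = jax.pmap(lambda x: x * 2.0)(xs)
   print(ys.shape)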
jax.scipy module
-------------------------------------------------------------------------------
A SciPy-like API for scientific computing.
.. list-table::
:header-rows: 1
* - Module
- As of JAX
- As of ROCm
* - ``jax.scipy.cluster``
- 0.3.11
- 5.1.0
* - ``jax.scipy.fft``
- 0.1.71
- 5.0.0
* - ``jax.scipy.integrate``
- 0.4.15
- 5.5.0
* - ``jax.scipy.interpolate``
- 0.1.76
- 5.0.0
* - ``jax.scipy.linalg``
- 0.1.56
- 5.0.0
* - ``jax.scipy.ndimage``
- 0.1.56
- 5.0.0
* - ``jax.scipy.optimize``
- 0.1.57
- 5.0.0
* - ``jax.scipy.signal``
- 0.1.56
- 5.0.0
* - ``jax.scipy.spatial.transform``
- 0.4.12
- 5.4.0
* - ``jax.scipy.sparse.linalg``
- 0.1.56
- 5.0.0
* - ``jax.scipy.special``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats``
- 0.1.56
- 5.0.0
jax.scipy.stats module
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. list-table::
:header-rows: 1
* - Module
- As of JAX
- As of ROCm
* - ``jax.scipy.stats.bernoulli``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.beta``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.betabinom``
- 0.1.61
- 5.0.0
* - ``jax.scipy.stats.binom``
- 0.4.14
- 5.4.0
* - ``jax.scipy.stats.cauchy``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.chi2``
- 0.1.61
- 5.0.0
* - ``jax.scipy.stats.dirichlet``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.expon``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.gamma``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.gennorm``
- 0.3.15
- 5.2.0
* - ``jax.scipy.stats.geom``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.laplace``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.logistic``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.multinomial``
- 0.3.18
- 5.1.0
* - ``jax.scipy.stats.multivariate_normal``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.nbinom``
- 0.1.72
- 5.0.0
* - ``jax.scipy.stats.norm``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.pareto``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.poisson``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.t``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.truncnorm``
- 0.4.0
- 5.3.0
* - ``jax.scipy.stats.uniform``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.vonmises``
- 0.4.2
- 5.3.0
* - ``jax.scipy.stats.wrapcauchy``
- 0.4.20
- 5.6.0
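These distributions follow SciPy's calling conventions, for example (a minimal sketch):

.. code-block:: python

   import jax.numpy as jnp
   from jax.scipy.stats import norm

   x = jnp.linspace(-3.0, 3.0, 7)
   print(norm.pdf(x))                       # standard normal density
   print(norm.logpdf(x, loc=1.0, scale=2.0))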
jax.extend module
-------------------------------------------------------------------------------
Modules for JAX extensions.
.. list-table::
:header-rows: 1
* - Module
- As of JAX
- As of ROCm
* - ``jax.extend.ffi``
- 0.4.30
- 6.0.0
* - ``jax.extend.linear_util``
- 0.4.17
- 5.6.0
* - ``jax.extend.mlir``
- 0.4.26
- 5.6.0
* - ``jax.extend.random``
- 0.4.15
- 5.5.0
Unsupported JAX features
===============================================================================
The following GPU-accelerated JAX features are not supported by ROCm for
the listed supported JAX versions.
.. list-table::
:header-rows: 1
* - Feature
* - Data type
- Description
* - Mixed Precision with TF32
- Mixed precision with TF32 is used for matrix multiplications,
convolutions, and other linear algebra operations, particularly in
deep learning workloads like CNNs and transformers.
* - ``bfloat16``
- 16-bit bfloat (brain floating point).
* - XLA int4 support
- 4-bit integer (int4) precision in the XLA compiler.
* - ``bool``
- Boolean.
* - MOSAIC (GPU)
- Mosaic is a library of kernel-building abstractions for JAX's Pallas system
* - ``complex128``
- 128-bit complex.
* - ``complex64``
- 64-bit complex.
* - ``float16``
- 16-bit (half precision) floating-point.
* - ``float32``
- 32-bit (single precision) floating-point.
* - ``float64``
- 64-bit (double precision) floating-point.
* - ``half``
- 16-bit (half precision) floating-point.
* - ``int16``
- Signed 16-bit integer.
* - ``int32``
- Signed 32-bit integer.
* - ``int64``
- Signed 64-bit integer.
* - ``int8``
- Signed 8-bit integer.
* - ``uint16``
- Unsigned 16-bit (word) integer.
* - ``uint32``
- Unsigned 32-bit (dword) integer.
* - ``uint64``
- Unsigned 64-bit (qword) integer.
* - ``uint8``
- Unsigned 8-bit (byte) integer.
.. note::
JAX data type support is affected by the :ref:`key_rocm_libraries` and is
collected on the :doc:`ROCm data types and precision support <rocm:reference/precision-support>`
page.
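Requesting a specific data type works as in NumPy. The following is a short sketch; note that 64-bit types require opting in via ``jax_enable_x64``.

.. code-block:: python

   import jax
   import jax.numpy as jnp

   a = jnp.ones((2, 2), dtype=jnp.bfloat16)
   b = jnp.zeros(3, dtype=jnp.int8)
   print(a.dtype, b.dtype)

   # float64/int64 are disabled by default in JAX; opt in explicitly.
   jax.config.update("jax_enable_x64", True)
   c = jnp.ones(2, dtype=jnp.float64)
   print(c.dtype)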
Supported modules
--------------------------------------------------------------------------------
For a complete and up-to-date list of JAX public modules (for example, ``jax.numpy``,
``jax.scipy``, ``jax.lax``), their descriptions, and usage, please refer directly to the
`official JAX API documentation <https://jax.readthedocs.io/en/latest/jax.html>`_.
.. note::
Since version 0.1.56, JAX has full support for ROCm, and the
:ref:`Known issues and important notes <jax_comp_known_issues>` section
contains details about limitations specific to the ROCm backend. The list of
JAX API modules is maintained by the JAX project and is subject to change.
Refer to the official JAX documentation for the most up-to-date information.

View File

@@ -0,0 +1,93 @@
:orphan:
.. meta::
:description: Megablocks compatibility
:keywords: GPU, megablocks, compatibility
.. version-set:: rocm_version latest
********************************************************************************
Megablocks compatibility
********************************************************************************
Megablocks is a lightweight library for mixture-of-experts (MoE) training.
The core of the system consists of efficient "dropless-MoE" and standard MoE layers.
Megablocks is integrated with `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_,
where data and pipeline parallel training of MoEs is supported.
* ROCm support for Megablocks is hosted in the official `https://github.com/ROCm/megablocks <https://github.com/ROCm/megablocks>`_ repository.
* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository.
* Use the prebuilt :ref:`Docker image <megablocks-docker-compat>` with ROCm, PyTorch, and Megablocks preinstalled.
* See the :doc:`ROCm Megablocks installation guide <rocm-install-on-linux:install/3rd-party/megablocks-install>` to install and get started.
.. note::
Megablocks is supported on ROCm 6.3.0.
Supported devices
================================================================================
- **Officially Supported**: AMD Instinct MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210X
Supported models and features
================================================================================
This section summarizes the Megablocks features supported by ROCm.
* Distributed Pre-training
* Activation Checkpointing and Recomputation
* Distributed Optimizer
* Mixture-of-Experts
* dropless-Mixture-of-Experts
.. _megablocks-recommendations:
Use cases and recommendations
================================================================================
The `ROCm Megablocks blog posts <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_
describe how to leverage the ROCm platform for pre-training with the Megablocks framework.
They show how to pre-process datasets and how to begin pre-training on AMD GPUs through (a distributed-setup sketch follows the list):
* Single-GPU pre-training
* Multi-GPU pre-training
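Multi-GPU pre-training relies on the standard PyTorch distributed stack, where the ``nccl`` backend name maps to RCCL on ROCm. The following is a minimal, generic initialization sketch, not a Megablocks-specific entry point; the training scripts in the repository drive the actual pre-training. It assumes the process is launched with ``torchrun``, which sets the rank environment variables.

.. code-block:: python

   import os
   import torch
   import torch.distributed as dist

   def init_distributed() -> int:
       """Initialize one process per GPU under torchrun."""
       local_rank = int(os.environ.get("LOCAL_RANK", 0))
       torch.cuda.set_device(local_rank)        # AMD GPUs via the CUDA device API
       dist.init_process_group(backend="nccl")  # RCCL on ROCm
       return local_rank

   if __name__ == "__main__":
       rank = init_distributed()
       t = torch.ones(1, device="cuda")
       dist.all_reduce(t)                       # sanity-check collective
       print(f"rank {dist.get_rank()}: {t.item()}")
       dist.destroy_process_group()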
.. _megablocks-docker-compat:
Docker image compatibility
================================================================================
.. |docker-icon| raw:: html
<i class="fab fa-docker"></i>
AMD validates and publishes `ROCm Megablocks images <https://hub.docker.com/r/rocm/megablocks/tags>`_
with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
inventories represent the latest Megablocks version from the official Docker Hub.
The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
Click |docker-icon| to view the image on Docker Hub.
.. list-table::
:header-rows: 1
:class: docker-image-compatibility
* - Docker image
- ROCm
- Megablocks
- PyTorch
- Ubuntu
- Python
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/megablocks/megablocks-0.7.0_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-372ff89b96599019b8f5f9db469c84add2529b713456781fa62eb9a148659ab4"><i class="fab fa-docker fa-lg"></i> rocm/megablocks</a>
- `6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_
- `0.7.0 <https://github.com/databricks/megablocks/releases/tag/v0.7.0>`_
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
- 24.04
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_

File diff suppressed because it is too large

View File

@@ -0,0 +1,100 @@
:orphan:
.. meta::
:description: Stanford Megatron-LM compatibility
:keywords: Stanford, Megatron-LM, compatibility
.. version-set:: rocm_version latest
********************************************************************************
Stanford Megatron-LM compatibility
********************************************************************************
Stanford Megatron-LM is a large-scale language model training framework based on NVIDIA's `Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_. It is
designed to train massive transformer-based language models efficiently through model and data parallelism.
* ROCm support for Stanford Megatron-LM is hosted in the official `https://github.com/ROCm/Stanford-Megatron-LM <https://github.com/ROCm/Stanford-Megatron-LM>`_ repository.
* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository.
* Use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>` with ROCm, PyTorch, and Megatron-LM preinstalled.
* See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>` to install and get started.
.. note::
Stanford Megatron-LM is supported on ROCm 6.3.0.
Supported devices
================================================================================
- **Officially Supported**: AMD Instinct MI300X
- **Partially Supported** (functionality or performance limitations): AMD Instinct MI250X, MI210X
Supported models and features
================================================================================
This section details the models and features supported by the ROCm version of Stanford Megatron-LM.
Models:
* BERT
* GPT
* T5
* ICT
Features:
* Distributed Pre-training
* Activation Checkpointing and Recomputation
* Distributed Optimizer
* Mixture-of-Experts
.. _megatron-lm-recommendations:
Use cases and recommendations
================================================================================
See the `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_ blog post
to learn how to leverage the ROCm platform for pre-training with the Stanford Megatron-LM framework, including pre-processing datasets on AMD GPUs.
Coverage includes:
* Single-GPU pre-training
* Multi-GPU pre-training
.. _megatron-lm-docker-compat:
Docker image compatibility
================================================================================
.. |docker-icon| raw:: html
<i class="fab fa-docker"></i>
AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/megatron-lm>`_
with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
inventories represent the latest Megatron-LM version from the official Docker Hub.
The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
Click |docker-icon| to view the image on Docker Hub.
.. list-table::
:header-rows: 1
:class: docker-image-compatibility
* - Docker image
- Stanford Megatron-LM
- PyTorch
- Ubuntu
- Python
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i></a>
- `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
- 24.04
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_

View File

@@ -10,16 +10,16 @@
TensorFlow compatibility
*******************************************************************************
`TensorFlow <https://www.tensorflow.org/>`_ is an open-source library for
`TensorFlow <https://www.tensorflow.org/>`__ is an open-source library for
solving machine learning, deep learning, and AI problems. It can solve many
problems across different sectors and industries but primarily focuses on
neural network training and inference. It is one of the most popular and
in-demand frameworks and is very active in open-source contribution and
development.
The `official TensorFlow repository <http://github.com/tensorflow/tensorflow>`_
The `official TensorFlow repository <http://github.com/tensorflow/tensorflow>`__
includes full ROCm support. AMD maintains a TensorFlow `ROCm repository
<http://github.com/rocm/tensorflow-upstream>`_ in order to quickly add bug
<http://github.com/rocm/tensorflow-upstream>`__ in order to quickly add bug
fixes, updates, and support for the latest ROCm versions.
- ROCm TensorFlow release:
@@ -27,16 +27,16 @@ fixes, updates, and support for the latest ROCM versions.
- Offers :ref:`Docker images <tensorflow-docker-compat>` with
ROCm and TensorFlow pre-installed.
- ROCm TensorFlow repository: `<https://github.com/ROCm/tensorflow-upstream>`_
- ROCm TensorFlow repository: `<https://github.com/ROCm/tensorflow-upstream>`__
- See the :doc:`ROCm TensorFlow installation guide <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
to get started.
- Official TensorFlow release:
- Official TensorFlow repository: `<https://github.com/tensorflow/tensorflow>`_
- Official TensorFlow repository: `<https://github.com/tensorflow/tensorflow>`__
- See the `TensorFlow API versions <https://www.tensorflow.org/versions>`_ list.
- See the `TensorFlow API versions <https://www.tensorflow.org/versions>`__ list.
.. note::
@@ -54,9 +54,9 @@ Docker image compatibility
<i class="fab fa-docker"></i>
AMD validates and publishes ready-made `TensorFlow images
<https://hub.docker.com/r/rocm/tensorflow>`_ with ROCm backends on
<https://hub.docker.com/r/rocm/tensorflow>`__ with ROCm backends on
Docker Hub. The following Docker image tags and associated inventories are
validated for `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click
validated for `ROCm 6.4.2 <https://repo.radeon.com/rocm/apt/6.4.2/>`__. Click
the |docker-icon| icon to view the image on Docker Hub.
.. list-table:: TensorFlow Docker image components
@@ -65,129 +65,62 @@ the |docker-icon| icon to view the image on Docker Hub.
* - Docker image
- TensorFlow
- Ubuntu
- Dev
- Python
- TensorBoard
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.12-tf2.18-dev/images/sha256-fa9cf5fa6c6079a7118727531ccd0056c6e3224a42c3d6e78a49e7781daafff4"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.18-dev/images/sha256-96754ce2d30f729e19b497279915b5212ba33d5e408e7e5dd3f2304d87e3441e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- dev
- `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- 24.04
- `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_
- `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.18-runtime/images/sha256-d14d8c4989e7c9a60f4e72461b9e349de72347c6162dcd6897e6f4f80ffbb440"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.18-dev/images/sha256-fa741508d383858e86985a9efac85174529127408102558ae2e3a4ac894eea1e"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- runtime
- `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- 22.04
- `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`__
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.17-dev/images/sha256-3a0aef09f2a8833c2b64b85874dd9449ffc2ad257351857338ff5b706c03a418"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- 24.04
- `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_
- `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.18-dev/images/sha256-081e5bd6615a5dc17247ebd2ccc26895c3feeff086720400fa39b477e60a77c0"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.17-dev/images/sha256-bc7341a41ebe7ab261aa100732874507c452421ef733e408ac4f05ed453b0bc5"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- dev
- `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- 22.04
- `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_
- `Python 3.10 <https://www.python.org/downloads/release/python-31017/>`__
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`__
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.18-runtime/images/sha256-bf369637378264f4af6ddad5ca8b8611d3e372ffbea9ab7a06f1e122f0a0867b"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.12-tf2.16-dev/images/sha256-4841a8df7c340dab79bf9362dad687797649a00d594e0832eb83ea6880a40d3b"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- runtime
- 22.04
- `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
- `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.17-dev/images/sha256-5a502008c50d0b6508e6027f911bdff070a7493700ae064bed74e1d22b91ed50"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- dev
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- 24.04
- `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
- `Python 3.12 <https://www.python.org/downloads/release/python-31210/>`__
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`__
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.17-runtime/images/sha256-1ee5dfffceb71ac66617ada33de3a10de0cb74199cc4b82441192e5e92fa2ddf"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.2-py3.10-tf2.16-dev/images/sha256-883fa95aba960c58a3e46fceaa18f03ede2c7df89b8e9fd603ab2d47e0852897"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- runtime
- 24.04
- `Python 3.12.10 <https://www.python.org/downloads/release/python-3124/>`_
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.17-dev/images/sha256-109218ad92bfae83bbd2710475f7502166e1ed54ca0b9748a9cbc3f5a1d75af1"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- dev
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.2/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- 22.04
- `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.17-runtime/images/sha256-5d78bd5918d394f92263daa2990e88d695d27200dd90ed83ec64d20c7661c9c1"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- runtime
- 22.04
- `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
- `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.16-dev/images/sha256-b09b1ad921c09c687b7c916141051e9fcf15539a5686e5aa67c689195a522719"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- dev
- 24.04
- `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.16-runtime/images/sha256-20dbd824e85558abfe33fc9283cc547d88cde3c623fe95322743a5082f883a64"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
- runtime
- 24.04
- `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.16-dev/images/sha256-36c4fa047c86e2470ac473ec1429aea6d4b8934b90ffeb34d1afab40e7e5b377"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.16.2 <https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.16-dev/images/sha256-36c4fa047c86e2470ac473ec1429aea6d4b8934b90ffeb34d1afab40e7e5b377>`__
- dev
- 22.04
- `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.16-runtime/images/sha256-a94150ffb81365234ebfa34e764db5474bc6ab7d141b56495eac349778dafcf3"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
- `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
- runtime
- 22.04
- `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
- `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
Critical ROCm libraries for TensorFlow
@@ -207,43 +140,43 @@ are available in ROCm :version:`rocm_version`.
- Version
- Purpose
- Used in
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
- :version-ref:`hipBLAS rocm_version`
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
matrix and vector operations.
- Accelerates operations like ``tf.matmul``, ``tf.linalg.matmul``, and
other matrix multiplications commonly used in neural network layers.
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
- :version-ref:`hipBLASLt rocm_version`
- Extends hipBLAS with additional optimizations like fused kernels and
integer tensor cores.
- Optimizes matrix multiplications and linear algebra operations used in
layers like dense, convolutional, and RNNs in TensorFlow.
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
* - `hipCUB <https://github.com/ROCm/hipCUB>`__
- :version-ref:`hipCUB rocm_version`
- Provides a C++ template library for parallel algorithms such as reduction,
scan, sort, and select.
- Supports operations like ``tf.reduce_sum``, ``tf.cumsum``, ``tf.sort``
and other tensor operations in TensorFlow, especially those involving
scanning, sorting, and filtering.
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
* - `hipFFT <https://github.com/ROCm/hipFFT>`__
- :version-ref:`hipFFT rocm_version`
- Accelerates Fast Fourier Transforms (FFT) for signal processing tasks.
- Used for operations like signal processing, image filtering, and
certain types of neural networks requiring FFT-based transformations.
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`__
- :version-ref:`hipSOLVER rocm_version`
- Provides GPU-accelerated direct linear solvers for dense and sparse
systems.
- Optimizes linear algebra functions such as solving systems of linear
equations, often used in optimization and training tasks.
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`__
- :version-ref:`hipSPARSE rocm_version`
- Optimizes sparse matrix operations for efficient computations on sparse
data.
- Accelerates sparse matrix operations in models with sparse weight
matrices or activations, commonly used in neural networks.
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
* - `MIOpen <https://github.com/ROCm/MIOpen>`__
- :version-ref:`MIOpen rocm_version`
- Provides optimized deep learning primitives such as convolutions,
pooling,
@@ -251,13 +184,13 @@ are available in ROCm :version:`rocm_version`.
- Speeds up convolutional neural networks (CNNs) and other layers. Used
in TensorFlow for layers like ``tf.nn.conv2d``, ``tf.nn.relu``, and
``tf.nn.lstm_cell``.
* - `RCCL <https://github.com/ROCm/rccl>`_
* - `RCCL <https://github.com/ROCm/rccl>`__
- :version-ref:`RCCL rocm_version`
- Optimizes multi-GPU communication for operations like AllReduce and
Broadcast.
- Distributed data parallel training (``tf.distribute.MirroredStrategy``).
Handles communication in multi-GPU setups.
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
* - `rocThrust <https://github.com/ROCm/rocThrust>`__
- :version-ref:`rocThrust rocm_version`
- Provides a C++ template library for parallel algorithms like sorting,
reduction, and scanning.
@@ -278,7 +211,7 @@ The data type of a tensor is specified using the ``dtype`` attribute or
argument, and TensorFlow supports a wide range of data types for different use
cases.
The basic, single data types of `tf.dtypes <https://www.tensorflow.org/api_docs/python/tf/dtypes>`_
The basic, single data types of `tf.dtypes <https://www.tensorflow.org/api_docs/python/tf/dtypes>`__
are as follows:
.. list-table::
@@ -550,7 +483,7 @@ Use cases and recommendations
===============================================================================
* The `Training a Neural Collaborative Filtering (NCF) Recommender on an AMD
GPU <https://rocm.blogs.amd.com/artificial-intelligence/ncf/README.html>`_
GPU <https://rocm.blogs.amd.com/artificial-intelligence/ncf/README.html>`__
blog post discusses training an NCF recommender system using TensorFlow. It
explains how NCF improves traditional collaborative filtering methods by
leveraging neural networks to model non-linear user-item interactions. The
@@ -559,7 +492,7 @@ Use cases and recommendations
purchasing) and how it addresses challenges like the lack of negative values.
* The `Creating a PyTorch/TensorFlow code environment on AMD GPUs
<https://rocm.blogs.amd.com/software-tools-optimization/pytorch-tensorflow-env/README.html>`_
<https://rocm.blogs.amd.com/software-tools-optimization/pytorch-tensorflow-env/README.html>`__
blog post provides instructions for creating a machine learning environment
for PyTorch and TensorFlow on AMD GPUs using ROCm. It covers steps like
installing the libraries, cloning code repositories, installing dependencies,
@@ -568,4 +501,4 @@ Use cases and recommendations
for a better experience on AMD GPUs. This guide aims to help data scientists
and ML practitioners adapt their code for AMD GPUs.
For more use cases and recommendations, see the `ROCm TensorFlow blog posts <https://rocm.blogs.amd.com/blog/tag/tensorflow.html>`_.
For more use cases and recommendations, see the `ROCm TensorFlow blog posts <https://rocm.blogs.amd.com/blog/tag/tensorflow.html>`__.

View File

@@ -0,0 +1,86 @@
:orphan:
.. meta::
:description: verl compatibility
:keywords: GPU, verl compatibility
.. version-set:: rocm_version latest
*******************************************************************************
verl compatibility
*******************************************************************************
Volcano Engine Reinforcement Learning for LLMs (verl) is a reinforcement learning framework designed for large language models (LLMs).
verl offers a scalable, open-source fine-tuning solution optimized for AMD Instinct GPUs with full ROCm support.
* See the `verl documentation <https://verl.readthedocs.io/en/latest/>`_ for more information about verl.
* The official verl GitHub repository is `https://github.com/volcengine/verl <https://github.com/volcengine/verl>`_.
* Use the AMD-validated :ref:`Docker images <verl-docker-compat>` with ROCm and verl preinstalled.
* See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>` to install and get started.
.. note::
verl is supported on ROCm 6.2.0.
.. _verl-recommendations:
Use cases and recommendations
================================================================================
The benefits of verl in large-scale reinforcement learning from human feedback (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`_ blog post.
.. _verl-supported_features:
Supported features
===============================================================================
The following table shows ROCm support for verl's GPU-accelerated modules.
.. list-table::
:header-rows: 1
* - Module
- Description
- verl version
- ROCm version
* - ``FSDP``
- Training engine
- 0.3.0.post0
- 6.2.0
* - ``vllm``
- Inference engine
- 0.3.0.post0
- 6.2.0
.. _verl-docker-compat:
Docker image compatibility
================================================================================
.. |docker-icon| raw:: html
<i class="fab fa-docker"></i>
AMD validates and publishes ready-made `ROCm verl Docker images <https://hub.docker.com/r/rocm/verl/tags>`_
with ROCm backends on Docker Hub. The following Docker image tags and their associated inventories list the verl versions available on Docker Hub.
.. list-table::
:header-rows: 1
* - Docker image
- ROCm
- verl
- Ubuntu
- PyTorch
- Python
- vllm
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/verl/verl-0.3.0.post0_rocm6.2_vllm0.6.3/images/sha256-cbe423803fd7850448b22444176bee06f4dcf22cd3c94c27732752d3a39b04b2"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
- `6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`_
- `0.3.0.post0 <https://github.com/volcengine/verl/releases/tag/v0.3.0.post0>`_
- 20.04
- `2.5.0 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
- `3.9.19 <https://www.python.org/downloads/release/python-3919/>`_
- `0.6.3 <https://github.com/vllm-project/vllm/releases/tag/v0.6.3>`_

View File

@@ -8,7 +8,7 @@ MI300 and MI200 series performance counters and metrics
This document lists and describes the hardware performance counters and derived metrics available
for the AMD Instinct™ MI300 and MI200 GPU. You can also access this information using the
:doc:`ROCProfiler tool <rocprofiler:rocprofv1>`.
:doc:`ROCprofiler-SDK <rocprofiler-sdk:how-to/using-rocprofv3>`.
MI300 and MI200 series performance counters
===============================================================

View File

@@ -12,6 +12,54 @@ from pathlib import Path
shutil.copy2("../RELEASE.md", "./about/release-notes.md")
shutil.copy2("../CHANGELOG.md", "./release/changelog.md")
# Mark the consolidated changelog as orphan to prevent Sphinx from warning about missing toctree entries
with open("./release/changelog.md", "r+") as file:
content = file.read()
file.seek(0)
file.write(":orphan:\n" + content)
# Replace GitHub-style [!ADMONITION]s with Sphinx-compatible ```{admonition} blocks
with open("./release/changelog.md", "r") as file:
lines = file.readlines()
modified_lines = []
in_admonition_section = False
# Map for matching the specific admonition type to its corresponding Sphinx markdown syntax
admonition_types = {
'> [!NOTE]': '```{note}',
'> [!TIP]': '```{tip}',
'> [!IMPORTANT]': '```{important}',
'> [!WARNING]': '```{warning}',
'> [!CAUTION]': '```{caution}'
}
for line in lines:
if any(line.startswith(k) for k in admonition_types):
for key in admonition_types:
if line.startswith(key):
modified_lines.append(admonition_types[key] + '\n')
break
in_admonition_section = True
elif in_admonition_section:
if line.strip() == '':
# If we encounter an empty line, close the admonition section
modified_lines.append('```\n\n') # Close the admonition block
in_admonition_section = False
else:
modified_lines.append(line.lstrip('> '))
else:
modified_lines.append(line)
# In case the file ended while still in an admonition section, close it
if in_admonition_section:
modified_lines.append('```')
with open("./release/changelog.md", 'w') as file:
file.writelines(modified_lines)
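# Illustration only (not part of conf.py): a self-contained sketch of the
# same GitHub-to-MyST admonition rewrite performed above, assuming the input
# is a list of lines as returned by readlines(). All names are illustrative.
def convert_admonitions(lines, mapping):
    out, in_block = [], False
    for line in lines:
        key = next((k for k in mapping if line.startswith(k)), None)
        if key is not None:
            out.append(mapping[key] + '\n')  # open the MyST block
            in_block = True
        elif in_block and line.strip() == '':
            out.append('```\n\n')            # a blank line closes the block
            in_block = False
        elif in_block:
            out.append(line.lstrip('> '))    # drop the blockquote prefix
        else:
            out.append(line)
    if in_block:                             # input ended inside a block
        out.append('```')
    return out
# Example: convert_admonitions(['> [!NOTE]\n', '> A note.\n', '\n'],
#                              {'> [!NOTE]': '```{note}'})
# returns ['```{note}\n', 'A note.\n', '```\n\n']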
os.system("mkdir -p ../_readthedocs/html/downloads")
os.system("cp compatibility/compatibility-matrix-historical-6.0.csv ../_readthedocs/html/downloads/compatibility-matrix-historical-6.0.csv")
@@ -34,20 +82,24 @@ project = "ROCm Documentation"
project_path = os.path.abspath(".").replace("\\", "/")
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
version = "6.4.1"
release = "6.4.1"
version = "6.4.2"
release = "6.4.2"
setting_all_article_info = True
all_article_info_os = ["linux", "windows"]
all_article_info_author = ""
# pages with specific settings
article_pages = [
{"file": "about/release-notes", "os": ["linux"], "date": "2025-05-07"},
{"file": "about/release-notes", "os": ["linux"], "date": "2025-07-21"},
{"file": "release/changelog", "os": ["linux"],},
{"file": "compatibility/compatibility-matrix", "os": ["linux"]},
{"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
{"file": "compatibility/ml-compatibility/tensorflow-compatibility", "os": ["linux"]},
{"file": "compatibility/ml-compatibility/jax-compatibility", "os": ["linux"]},
{"file": "compatibility/ml-compatibility/verl-compatibility", "os": ["linux"]},
{"file": "compatibility/ml-compatibility/stanford-megatron-lm-compatibility", "os": ["linux"]},
{"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
{"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
{"file": "how-to/deep-learning-rocm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
@@ -57,10 +109,22 @@ article_pages = [
{"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
@@ -71,8 +135,18 @@ article_pages = [
{"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/vllm-benchmark", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/pytorch-inference-benchmark", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.7.3-20250325", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.3-20250415", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
@@ -128,6 +202,7 @@ html_theme_options = {"link_main_doc": False}
redirects = {"reference/openmp/openmp": "../../about/compatibility/openmp.html"}
numfig = False
suppress_warnings = ["autosectionlabel.*"]
html_context = {
"project_path" : {project_path},

View File

@@ -0,0 +1,159 @@
vllm_benchmark:
unified_docker:
latest:
pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.7.3_20250325
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640
rocm_version: 6.3.1
vllm_version: 0.7.3
pytorch_version: 2.7.0 (dev nightly)
hipblaslt_version: 0.13
model_groups:
- group: Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: pyt_vllm_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: float16
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 3.2 11B Vision
mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
precision: float16
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
- group: Mistral
tag: mistral
models:
- model: Mixtral MoE 8x7B
mad_tag: pyt_vllm_mixtral-8x7b
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
- model: Mistral 7B
mad_tag: pyt_vllm_mistral-7b
model_repo: mistralai/Mistral-7B-Instruct-v0.3
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
precision: float16
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mistral 7B FP8
mad_tag: pyt_vllm_mistral-7b_fp8
model_repo: amd/Mistral-7B-v0.1-FP8-KV
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
precision: float8
- group: Qwen
tag: qwen
models:
- model: Qwen2 7B
mad_tag: pyt_vllm_qwen2-7b
model_repo: Qwen/Qwen2-7B-Instruct
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
precision: float16
- model: Qwen2 72B
mad_tag: pyt_vllm_qwen2-72b
model_repo: Qwen/Qwen2-72B-Instruct
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
precision: float16
- group: JAIS
tag: jais
models:
- model: JAIS 13B
mad_tag: pyt_vllm_jais-13b
model_repo: core42/jais-13b-chat
url: https://huggingface.co/core42/jais-13b-chat
precision: float16
- model: JAIS 30B
mad_tag: pyt_vllm_jais-30b
model_repo: core42/jais-30b-chat-v3
url: https://huggingface.co/core42/jais-30b-chat-v3
precision: float16
- group: DBRX
tag: dbrx
models:
- model: DBRX Instruct
mad_tag: pyt_vllm_dbrx-instruct
model_repo: databricks/dbrx-instruct
url: https://huggingface.co/databricks/dbrx-instruct
precision: float16
- model: DBRX Instruct FP8
mad_tag: pyt_vllm_dbrx_fp8
model_repo: amd/dbrx-instruct-FP8-KV
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
precision: float8
- group: Gemma
tag: gemma
models:
- model: Gemma 2 27B
mad_tag: pyt_vllm_gemma-2-27b
model_repo: google/gemma-2-27b
url: https://huggingface.co/google/gemma-2-27b
precision: float16
- group: Cohere
tag: cohere
models:
- model: C4AI Command R+ 08-2024
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
precision: float16
- model: C4AI Command R+ 08-2024 FP8
mad_tag: pyt_vllm_command-r-plus_fp8
model_repo: amd/c4ai-command-r-plus-FP8-KV
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
precision: float8
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek MoE 16B
mad_tag: pyt_vllm_deepseek-moe-16b-chat
model_repo: deepseek-ai/deepseek-moe-16b-chat
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
precision: float16
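These benchmark inventories are plain YAML, so scripts can consume them directly. The following is a minimal sketch, assuming PyYAML is installed and the inventory above is saved locally; the filename vllm_benchmark.yaml is hypothetical:
# Sketch: list every benchmark model and its MAD tag from a vLLM
# inventory shaped like the file above.
import yaml
with open('vllm_benchmark.yaml') as f:  # hypothetical local copy
    bench = yaml.safe_load(f)['vllm_benchmark']
print('Docker image:', bench['unified_docker']['latest']['pull_tag'])
for group in bench['model_groups']:
    for model in group['models']:
        print(f"{group['group']}: {model['model']} "
              f"({model.get('precision', 'n/a')}) -> {model['mad_tag']}")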

View File

@@ -0,0 +1,152 @@
vllm_benchmark:
unified_docker:
latest:
pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845
rocm_version: 6.3.1
vllm_version: 0.8.3
pytorch_version: 2.7.0 (dev nightly)
hipblaslt_version: 0.13
model_groups:
- group: Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: pyt_vllm_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: float16
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 3.2 11B Vision
mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
precision: float16
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
- group: Mistral
tag: mistral
models:
- model: Mixtral MoE 8x7B
mad_tag: pyt_vllm_mixtral-8x7b
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
- model: Mistral 7B
mad_tag: pyt_vllm_mistral-7b
model_repo: mistralai/Mistral-7B-Instruct-v0.3
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
precision: float16
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mistral 7B FP8
mad_tag: pyt_vllm_mistral-7b_fp8
model_repo: amd/Mistral-7B-v0.1-FP8-KV
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
precision: float8
- group: Qwen
tag: qwen
models:
- model: Qwen2 7B
mad_tag: pyt_vllm_qwen2-7b
model_repo: Qwen/Qwen2-7B-Instruct
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
precision: float16
- model: Qwen2 72B
mad_tag: pyt_vllm_qwen2-72b
model_repo: Qwen/Qwen2-72B-Instruct
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
precision: float16
- model: QwQ-32B
mad_tag: pyt_vllm_qwq-32b
model_repo: Qwen/QwQ-32B
url: https://huggingface.co/Qwen/QwQ-32B
precision: float16
tunableop: true
- group: DBRX
tag: dbrx
models:
- model: DBRX Instruct
mad_tag: pyt_vllm_dbrx-instruct
model_repo: databricks/dbrx-instruct
url: https://huggingface.co/databricks/dbrx-instruct
precision: float16
- model: DBRX Instruct FP8
mad_tag: pyt_vllm_dbrx_fp8
model_repo: amd/dbrx-instruct-FP8-KV
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
precision: float8
- group: Gemma
tag: gemma
models:
- model: Gemma 2 27B
mad_tag: pyt_vllm_gemma-2-27b
model_repo: google/gemma-2-27b
url: https://huggingface.co/google/gemma-2-27b
precision: float16
- group: Cohere
tag: cohere
models:
- model: C4AI Command R+ 08-2024
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
precision: float16
- model: C4AI Command R+ 08-2024 FP8
mad_tag: pyt_vllm_command-r-plus_fp8
model_repo: amd/c4ai-command-r-plus-FP8-KV
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
precision: float8
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek MoE 16B
mad_tag: pyt_vllm_deepseek-moe-16b-chat
model_repo: deepseek-ai/deepseek-moe-16b-chat
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
precision: float16

View File

@@ -0,0 +1,167 @@
vllm_benchmark:
unified_docker:
latest:
pull_tag: rocm/vllm:rocm6.3.1_vllm0.8.5_20250521
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11
rocm_version: 6.3.1
vllm_version: 0.8.5 (0.8.6.dev315+g91a560098.rocm631)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: pyt_vllm_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: float16
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 3.2 11B Vision
mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
precision: float16
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
- group: Mistral AI
tag: mistral
models:
- model: Mixtral MoE 8x7B
mad_tag: pyt_vllm_mixtral-8x7b
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
- model: Mistral 7B
mad_tag: pyt_vllm_mistral-7b
model_repo: mistralai/Mistral-7B-Instruct-v0.3
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
precision: float16
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mistral 7B FP8
mad_tag: pyt_vllm_mistral-7b_fp8
model_repo: amd/Mistral-7B-v0.1-FP8-KV
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
precision: float8
- group: Qwen
tag: qwen
models:
- model: Qwen2 7B
mad_tag: pyt_vllm_qwen2-7b
model_repo: Qwen/Qwen2-7B-Instruct
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
precision: float16
- model: Qwen2 72B
mad_tag: pyt_vllm_qwen2-72b
model_repo: Qwen/Qwen2-72B-Instruct
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
precision: float16
- model: QwQ-32B
mad_tag: pyt_vllm_qwq-32b
model_repo: Qwen/QwQ-32B
url: https://huggingface.co/Qwen/QwQ-32B
precision: float16
tunableop: true
- group: Databricks DBRX
tag: dbrx
models:
- model: DBRX Instruct
mad_tag: pyt_vllm_dbrx-instruct
model_repo: databricks/dbrx-instruct
url: https://huggingface.co/databricks/dbrx-instruct
precision: float16
- model: DBRX Instruct FP8
mad_tag: pyt_vllm_dbrx_fp8
model_repo: amd/dbrx-instruct-FP8-KV
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
precision: float8
- group: Google Gemma
tag: gemma
models:
- model: Gemma 2 27B
mad_tag: pyt_vllm_gemma-2-27b
model_repo: google/gemma-2-27b
url: https://huggingface.co/google/gemma-2-27b
precision: float16
- group: Cohere
tag: cohere
models:
- model: C4AI Command R+ 08-2024
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
precision: float16
- model: C4AI Command R+ 08-2024 FP8
mad_tag: pyt_vllm_command-r-plus_fp8
model_repo: amd/c4ai-command-r-plus-FP8-KV
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
precision: float8
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek MoE 16B
mad_tag: pyt_vllm_deepseek-moe-16b-chat
model_repo: deepseek-ai/deepseek-moe-16b-chat
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
precision: float16
- group: Microsoft Phi
tag: phi
models:
- model: Phi-4
mad_tag: pyt_vllm_phi-4
model_repo: microsoft/phi-4
url: https://huggingface.co/microsoft/phi-4
- group: TII Falcon
tag: falcon
models:
- model: Falcon 180B
mad_tag: pyt_vllm_falcon-180b
model_repo: tiiuae/falcon-180B
url: https://huggingface.co/tiiuae/falcon-180B
precision: float16

View File

@@ -0,0 +1,162 @@
vllm_benchmark:
unified_docker:
latest:
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c
rocm_version: 6.4.1
vllm_version: 0.9.0.1 (0.9.0.2.dev108+g71faa1880.rocm641)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: pyt_vllm_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: float16
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
precision: float16
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
- group: Mistral AI
tag: mistral
models:
- model: Mixtral MoE 8x7B
mad_tag: pyt_vllm_mixtral-8x7b
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
- model: Mistral 7B
mad_tag: pyt_vllm_mistral-7b
model_repo: mistralai/Mistral-7B-Instruct-v0.3
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
precision: float16
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mistral 7B FP8
mad_tag: pyt_vllm_mistral-7b_fp8
model_repo: amd/Mistral-7B-v0.1-FP8-KV
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
precision: float8
- group: Qwen
tag: qwen
models:
- model: Qwen2 7B
mad_tag: pyt_vllm_qwen2-7b
model_repo: Qwen/Qwen2-7B-Instruct
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
precision: float16
- model: Qwen2 72B
mad_tag: pyt_vllm_qwen2-72b
model_repo: Qwen/Qwen2-72B-Instruct
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
precision: float16
- model: QwQ-32B
mad_tag: pyt_vllm_qwq-32b
model_repo: Qwen/QwQ-32B
url: https://huggingface.co/Qwen/QwQ-32B
precision: float16
tunableop: true
- group: Databricks DBRX
tag: dbrx
models:
- model: DBRX Instruct
mad_tag: pyt_vllm_dbrx-instruct
model_repo: databricks/dbrx-instruct
url: https://huggingface.co/databricks/dbrx-instruct
precision: float16
- model: DBRX Instruct FP8
mad_tag: pyt_vllm_dbrx_fp8
model_repo: amd/dbrx-instruct-FP8-KV
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
precision: float8
- group: Google Gemma
tag: gemma
models:
- model: Gemma 2 27B
mad_tag: pyt_vllm_gemma-2-27b
model_repo: google/gemma-2-27b
url: https://huggingface.co/google/gemma-2-27b
precision: float16
- group: Cohere
tag: cohere
models:
- model: C4AI Command R+ 08-2024
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
precision: float16
- model: C4AI Command R+ 08-2024 FP8
mad_tag: pyt_vllm_command-r-plus_fp8
model_repo: amd/c4ai-command-r-plus-FP8-KV
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
precision: float8
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek MoE 16B
mad_tag: pyt_vllm_deepseek-moe-16b-chat
model_repo: deepseek-ai/deepseek-moe-16b-chat
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
precision: float16
- group: Microsoft Phi
tag: phi
models:
- model: Phi-4
mad_tag: pyt_vllm_phi-4
model_repo: microsoft/phi-4
url: https://huggingface.co/microsoft/phi-4
- group: TII Falcon
tag: falcon
models:
- model: Falcon 180B
mad_tag: pyt_vllm_falcon-180b
model_repo: tiiuae/falcon-180B
url: https://huggingface.co/tiiuae/falcon-180B
precision: float16

View File

@@ -0,0 +1,163 @@
vllm_benchmark:
unified_docker:
latest:
# TODO: update me
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab
rocm_version: 6.4.1
vllm_version: 0.9.1 (0.9.2.dev206+gb335519f2.rocm641)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: pyt_vllm_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: float16
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
precision: float16
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
- group: Mistral AI
tag: mistral
models:
- model: Mixtral MoE 8x7B
mad_tag: pyt_vllm_mixtral-8x7b
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
- model: Mistral 7B
mad_tag: pyt_vllm_mistral-7b
model_repo: mistralai/Mistral-7B-Instruct-v0.3
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
precision: float16
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mistral 7B FP8
mad_tag: pyt_vllm_mistral-7b_fp8
model_repo: amd/Mistral-7B-v0.1-FP8-KV
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
precision: float8
- group: Qwen
tag: qwen
models:
- model: Qwen2 7B
mad_tag: pyt_vllm_qwen2-7b
model_repo: Qwen/Qwen2-7B-Instruct
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
precision: float16
- model: Qwen2 72B
mad_tag: pyt_vllm_qwen2-72b
model_repo: Qwen/Qwen2-72B-Instruct
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
precision: float16
- model: QwQ-32B
mad_tag: pyt_vllm_qwq-32b
model_repo: Qwen/QwQ-32B
url: https://huggingface.co/Qwen/QwQ-32B
precision: float16
tunableop: true
- group: Databricks DBRX
tag: dbrx
models:
- model: DBRX Instruct
mad_tag: pyt_vllm_dbrx-instruct
model_repo: databricks/dbrx-instruct
url: https://huggingface.co/databricks/dbrx-instruct
precision: float16
- model: DBRX Instruct FP8
mad_tag: pyt_vllm_dbrx_fp8
model_repo: amd/dbrx-instruct-FP8-KV
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
precision: float8
- group: Google Gemma
tag: gemma
models:
- model: Gemma 2 27B
mad_tag: pyt_vllm_gemma-2-27b
model_repo: google/gemma-2-27b
url: https://huggingface.co/google/gemma-2-27b
precision: float16
- group: Cohere
tag: cohere
models:
- model: C4AI Command R+ 08-2024
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
precision: float16
- model: C4AI Command R+ 08-2024 FP8
mad_tag: pyt_vllm_command-r-plus_fp8
model_repo: amd/c4ai-command-r-plus-FP8-KV
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
precision: float8
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek MoE 16B
mad_tag: pyt_vllm_deepseek-moe-16b-chat
model_repo: deepseek-ai/deepseek-moe-16b-chat
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
precision: float16
- group: Microsoft Phi
tag: phi
models:
- model: Phi-4
mad_tag: pyt_vllm_phi-4
model_repo: microsoft/phi-4
url: https://huggingface.co/microsoft/phi-4
- group: TII Falcon
tag: falcon
models:
- model: Falcon 180B
mad_tag: pyt_vllm_falcon-180b
model_repo: tiiuae/falcon-180B
url: https://huggingface.co/tiiuae/falcon-180B
precision: float16

View File

@@ -1,6 +1,6 @@
pytorch_inference_benchmark:
unified_docker:
latest: &rocm-pytorch-docker-latest
latest:
pull_tag: rocm/pytorch:latest
docker_hub_url:
rocm_version:
@@ -23,3 +23,27 @@ pytorch_inference_benchmark:
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/chaidiscovery/chai-1
precision: float16
- group: Mochi Video
tag: mochi
models:
- model: Mochi 1
mad_tag: pyt_mochi_video_inference
model_repo: genmo/mochi-1-preview
url: https://huggingface.co/genmo/mochi-1-preview
precision: float16
- group: Wan2.1
tag: wan
models:
- model: Wan2.1
mad_tag: pyt_wan2.1_inference
model_repo: Wan-AI/Wan2.1-T2V-14B
url: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
precision: bfloat16
- group: Janus-Pro
tag: janus-pro
models:
- model: Janus Pro 7B
mad_tag: pyt_janus_pro_inference
model_repo: deepseek-ai/Janus-Pro-7B
url: https://huggingface.co/deepseek-ai/Janus-Pro-7B
precision: bfloat16

View File

@@ -0,0 +1,17 @@
sglang_benchmark:
unified_docker:
latest:
pull_tag: lmsysorg/sglang:v0.4.5-rocm630
docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
rocm_version: 6.3.0
sglang_version: 0.4.5 (0.4.5-rocm)
pytorch_version: 2.6.0a0+git8d4926e
model_groups:
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-R1-Distill-Qwen-32B
mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
precision: bfloat16

View File

@@ -1,10 +1,11 @@
vllm_benchmark:
unified_docker:
latest:
pull_tag: rocm/vllm:rocm6.3.1_vllm0.8.5_20250521
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11
rocm_version: 6.3.1
vllm_version: 0.8.5 (0.8.6.dev315+g91a560098.rocm631)
# TODO: update me
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
rocm_version: 6.4.1
vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:
@@ -26,11 +27,6 @@ vllm_benchmark:
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 3.2 11B Vision
mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf

View File

@@ -1,29 +1,60 @@
megatron-lm_benchmark:
model_groups:
- group: Meta Llama
tag: llama
models:
dockers:
- pull_tag: rocm/megatron-lm:v25.6_py312
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
components:
ROCm: 6.4.1
PyTorch: 2.8.0a0+git7d205b2
Python: 3.12
Transformer Engine: 2.1.0.dev0+8c4a512
hipBLASLt: 393e413
Triton: 3.3.0
RCCL: 2.23.4.7a84c5d
doc_name: Ubuntu 24.04 + Python 3.12
- pull_tag: rocm/megatron-lm:v25.6_py310
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
components:
ROCm: 6.4.1
PyTorch: 2.8.0a0+git7d205b2
Python: "3.10"
Transformer Engine: 2.1.0.dev0+8c4a512
hipBLASLt: 393e413
Triton: 3.3.0
RCCL: 2.23.4.7a84c5d
doc_name: Ubuntu 22.04 + Python 3.10
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.3 70B
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
- model: Llama 3.1 8B
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
- model: Llama 3.1 70B
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
- model: Llama 3.1 70B (proxy)
mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
- model: Llama 2 7B
mad_tag: pyt_megatron_lm_train_llama-2-7b
- model: Llama 2 70B
mad_tag: pyt_megatron_lm_train_llama-2-70b
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-V3
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-V3 (proxy)
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
- model: DeepSeek-V2-Lite
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
- group: Mistral AI
tag: mistral
models:
- group: Mistral AI
tag: mistral
models:
- model: Mixtral 8x7B
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
- model: Mixtral 8x22B
- model: Mixtral 8x22B (proxy)
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
- group: Qwen
tag: qwen
models:
- model: Qwen 2.5 7B
mad_tag: pyt_megatron_lm_train_qwen2.5-7b
- model: Qwen 2.5 72B
mad_tag: pyt_megatron_lm_train_qwen2.5-72b
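The v25.6 inventory above introduces a dockers list with per-image component versions. A short sketch of reading it, under the same assumptions as earlier (PyYAML; megatron_benchmark.yaml is a hypothetical local filename):
# Sketch: print each Megatron-LM Docker variant and its component versions.
import yaml
with open('megatron_benchmark.yaml') as f:  # hypothetical local copy
    bench = yaml.safe_load(f)['megatron-lm_benchmark']
for docker in bench['dockers']:
    print(docker['doc_name'], '->', docker['pull_tag'])
    for component, version in docker['components'].items():
        print(f'  {component}: {version}')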

View File

@@ -0,0 +1,29 @@
megatron-lm_benchmark:
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.3 70B
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
- model: Llama 3.1 8B
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
- model: Llama 3.1 70B
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
- model: Llama 2 7B
mad_tag: pyt_megatron_lm_train_llama-2-7b
- model: Llama 2 70B
mad_tag: pyt_megatron_lm_train_llama-2-70b
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-V3
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
- model: DeepSeek-V2-Lite
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
- group: Mistral AI
tag: mistral
models:
- model: Mixtral 8x7B
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
- model: Mixtral 8x22B
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy

View File

@@ -0,0 +1,120 @@
unified_docker:
latest:
pull_tag: rocm/pytorch-training:v25.6
docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
rocm_version: 6.4.1
pytorch_version: 2.8.0a0+git7d205b2
python_version: 3.10.17
transformer_engine_version: 1.14.0+2f85f5f2
flash_attention_version: 3.0.0.post1
hipblaslt_version: 0.15.0-8c6919d
triton_version: 3.3.0
model_groups:
- group: Pre-training
tag: pre-training
models:
- model: Llama 3.1 8B
mad_tag: pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
training_modes: [pretrain]
- model: Llama 3.1 70B
mad_tag: pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: BF16
training_modes: [pretrain]
- model: FLUX.1-dev
mad_tag: pyt_train_flux
model_repo: Flux
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
precision: BF16
training_modes: [pretrain]
- group: Fine-tuning
tag: fine-tuning
models:
- model: Llama 4 Scout 17B-16E
mad_tag: pyt_train_llama-4-scout-17b-16e
model_repo: Llama-4-17B_16E
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.3 70B
mad_tag: pyt_train_llama-3.3-70b
model_repo: Llama-3.3-70B
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 3.2 1B
mad_tag: pyt_train_llama-3.2-1b
model_repo: Llama-3.2-1B
url: https://huggingface.co/meta-llama/Llama-3.2-1B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.2 3B
mad_tag: pyt_train_llama-3.2-3b
model_repo: Llama-3.2-3B
url: https://huggingface.co/meta-llama/Llama-3.2-3B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.2 Vision 11B
mad_tag: pyt_train_llama-3.2-vision-11b
model_repo: Llama-3.2-Vision-11B
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
precision: BF16
training_modes: [finetune_fw]
- model: Llama 3.2 Vision 90B
mad_tag: pyt_train_llama-3.2-vision-90b
model_repo: Llama-3.2-Vision-90B
url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
precision: BF16
training_modes: [finetune_fw]
- model: Llama 3.1 8B
mad_tag: pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.1 70B
mad_tag: pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 3.1 405B
mad_tag: pyt_train_llama-3.1-405b
model_repo: Llama-3.1-405B
url: https://huggingface.co/meta-llama/Llama-3.1-405B
precision: BF16
training_modes: [finetune_qlora, HF_finetune_lora]
- model: Llama 3 8B
mad_tag: pyt_train_llama-3-8b
model_repo: Llama-3-8B
url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3 70B
mad_tag: pyt_train_llama-3-70b
model_repo: Llama-3-70B
url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 2 7B
mad_tag: pyt_train_llama-2-7b
model_repo: Llama-2-7B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 2 13B
mad_tag: pyt_train_llama-2-13b
model_repo: Llama-2-13B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 2 70B
mad_tag: pyt_train_llama-2-70b
model_repo: Llama-2-70B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
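Each fine-tuning entry above carries a training_modes list, which makes it straightforward to filter models by supported workflow. A brief sketch under the same assumptions (PyYAML; pytorch_training.yaml is a hypothetical local filename):
# Sketch: list models that support full-weight fine-tuning ('finetune_fw').
import yaml
with open('pytorch_training.yaml') as f:  # hypothetical local copy
    bench = yaml.safe_load(f)
for group in bench['model_groups']:
    for model in group['models']:
        if 'finetune_fw' in model.get('training_modes', []):
            print(model['model'], '->', model['mad_tag'])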

Binary file not shown (image diff: size changed from 1.2 MiB to 1.1 MiB).

View File

@@ -17,6 +17,10 @@ features for these ROCm-enabled deep learning frameworks.
* :doc:`PyTorch compatibility <../compatibility/ml-compatibility/pytorch-compatibility>`
* :doc:`TensorFlow compatibility <../compatibility/ml-compatibility/tensorflow-compatibility>`
* :doc:`JAX compatibility <../compatibility/ml-compatibility/jax-compatibility>`
* :doc:`verl compatibility <../compatibility/ml-compatibility/verl-compatibility>`
* :doc:`Stanford Megatron-LM compatibility <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`
* :doc:`DGL compatibility <../compatibility/ml-compatibility/dgl-compatibility>`
* :doc:`Megablocks compatibility <../compatibility/ml-compatibility/megablocks-compatibility>`
This chart steps through typical installation workflows for installing deep learning frameworks for ROCm.
@@ -29,6 +33,10 @@ See the installation instructions to get started.
* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/dgl-install>`
* :doc:`Megablocks for ROCm <rocm-install-on-linux:install/3rd-party/megablocks-install>`
.. note::

View File

@@ -7,21 +7,21 @@ AMD Instinct MI300X performance guides
**************************************
The following performance guides provide essential guidance on the necessary
steps to properly :doc:`configure your system for AMD Instinct™ MI300X
accelerators <../system-optimization/mi300x>`. They include detailed
instructions on system settings and application :doc:`workload tuning
<../rocm-for-ai/inference-optimization/workload>` to help you
leverage the maximum capabilities of these accelerators and achieve superior
performance.
steps to properly `configure your system for AMD Instinct™ MI300X accelerators
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
They include detailed instructions on system settings and application
:doc:`workload tuning </how-to/rocm-for-ai/inference-optimization/workload>` to
help you leverage the maximum capabilities of these accelerators and achieve
superior performance.
* `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__
covers essential system settings and system management practices to configure
your AMD Instinct MI300X system for performance.
* :doc:`../rocm-for-ai/inference-optimization/workload` covers steps to
* :doc:`/how-to/rocm-for-ai/inference-optimization/workload` covers steps to
optimize the performance of AMD Instinct MI300X series accelerators for HPC
and deep learning operations.
* :doc:`../rocm-for-ai/inference/vllm-benchmark` introduces a preconfigured
* :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm` introduces a preconfigured
environment for LLM inference, designed to help you test performance with
popular models on AMD Instinct MI300X series accelerators.

View File

@@ -24,5 +24,3 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
- :doc:`Fine-tuning and inference <fine-tuning-and-inference>` using a
:doc:`single-accelerator <single-gpu-fine-tuning-and-inference>` or
:doc:`multi-accelerator <multi-gpu-fine-tuning-and-inference>` system.

View File

@@ -6,7 +6,7 @@
Use ROCm for AI
**************************
ROCm is an open-source software platform that enables high-performance computing and machine learning applications. It features the ability to accelerate training, fine-tuning, and inference for AI application development. With ROCm, you can access the full power of AMD GPUs, which can significantly improve the performance and efficiency of AI workloads.
You can use ROCm to perform distributed training, which enables you to train models across multiple GPUs or nodes simultaneously. Additionally, ROCm supports mixed-precision training, which can help reduce the memory and compute requirements of training workloads. For fine-tuning, ROCm provides access to various algorithms and optimization techniques. In terms of inference, ROCm provides several techniques that can help you optimize your models for deployment, such as quantization, GEMM tuning, and optimization with composable kernel.

View File

@@ -151,8 +151,8 @@ desired effect. Continuous iteration helps refine the performance gains and
address any new bottlenecks that may emerge.
ROCm provides a prebuilt optimized Docker image that has everything required to implement
the LLM inference tips in this section. It includes ROCm, PyTorch, and vLLM.
For more information, see :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
.. _mi300x-profiling-tools:
@@ -343,9 +343,10 @@ The following performance tips are not *specific* to vLLM -- they are general
but relevant in this context. You can tune the following vLLM parameters to
achieve optimal request latency and throughput performance.
* As described in `Environment variables (MI300X)
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#environment-variables>`_,
the environment variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM
performance. Set it to ``export HIP_FORCE_DEV_KERNARG=1``.
* Set the :ref:`RCCL environment variable <mi300x-rccl>` ``NCCL_MIN_NCHANNELS``
to ``112`` to increase the number of channels on MI300X to potentially improve
@@ -410,9 +411,9 @@ for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
usage with ROCm.
ROCm provides a prebuilt optimized Docker image for validating the performance
of LLM inference with vLLM on MI300X series accelerators. The Docker image includes
ROCm, vLLM, and PyTorch. For more information, see
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
.. _mi300x-vllm-throughput-measurement:
@@ -1477,8 +1478,9 @@ following command: ``cat /proc/sys/kernel/numa_balancing`` and
checking whether the output is ``0``.
If the output is ``1``, you can disable NUMA auto-balancing by running the
following command: ``sudo sysctl kernel.numa_balancing=0``. For more details,
see `AMD Instinct MI300X system optimization
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#disable-numa-auto-balancing>`_.
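Taken together, the check and the fix described in this paragraph look like the following (a minimal sketch; note that the ``sysctl`` change does not persist across reboots):
.. code-block:: shell
# returns 1 if NUMA auto-balancing is currently enabled
cat /proc/sys/kernel/numa_balancing
# disable NUMA auto-balancing for the running kernel
sudo sysctl kernel.numa_balancing=0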
.. _mi300x-rccl-disable-acs:

View File

@@ -0,0 +1,25 @@
:orphan:
****************************************************
SGLang inference performance testing version history
****************************************************
This table lists previous versions of the ROCm SGLang inference performance
testing environment. For detailed information about available models for
benchmarking, see the version-specific documentation.
.. list-table::
:header-rows: 1
* - Docker image tag
- Components
- Resources
* - ``lmsysorg/sglang:v0.4.5-rocm630``
-
* ROCm 6.3.0
* SGLang 0.4.5
* PyTorch 2.6.0
-
* :doc:`Documentation <../sglang>`
* `Docker Hub <https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951>`__

View File

@@ -0,0 +1,346 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
ROCm Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
a prebuilt, optimized environment designed for validating large language model
(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
MI300X accelerator and includes the following components:
* `ROCm 6.2.0 <https://github.com/ROCm/ROCm>`_
* `vLLM 0.4.3 <https://docs.vllm.ai/en/latest>`_
* `PyTorch 2.4.0 <https://github.com/pytorch/pytorch>`_
* Tuning files (in CSV format)
With this Docker image, you can quickly validate the expected inference
performance numbers on the MI300X accelerator. This topic also provides tips on
optimizing performance with popular AI models.
.. _vllm-benchmark-vllm:
.. note::
vLLM is a toolkit and library for LLM inference and
serving. It deploys the PagedAttention algorithm, which reduces memory
consumption and increases throughput by leveraging dynamic key and value
allocation in GPU memory. vLLM also incorporates many LLM acceleration
and quantization algorithms. In addition, AMD implements high-performance
custom kernels and modules in vLLM to enhance performance further. See
:ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for more
information.
Getting started
===============
Use the following procedures to reproduce the benchmark results on an
MI300X accelerator with the prebuilt vLLM Docker image.
.. _vllm-benchmark-get-started:
1. Disable NUMA auto-balancing.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
Once setup is complete, you can choose between two options to reproduce the
benchmark results:
- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad-v043>`
- :ref:`Standalone benchmarking <vllm-benchmark-standalone-v043>`
.. _vllm-benchmark-mad-v043:
MAD-integrated benchmarking
===========================
Clone the ROCm Model Automation and Dashboarding (MAD) repository (`<https://github.com/ROCm/MAD>`__) to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run a performance benchmark test of the Llama 3.1 8B model
on one GPU with the ``float16`` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
ROCm MAD launches a Docker container with the name
``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_float16/``.
Although the following eight models are preconfigured to collect latency and
throughput performance data, you can also change the benchmarking parameters.
Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone-v043>` section.
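For instance, to benchmark another preconfigured model, substitute its tag from the list below (``pyt_vllm_mistral-7b`` is shown here as an illustrative choice):
.. code-block:: shell
python3 tools/run_models.py --tags pyt_vllm_mistral-7b --keep-model-dir --live-output --timeout 28800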
Available models
----------------
.. hlist::
:columns: 3
* ``pyt_vllm_llama-3.1-8b``
* ``pyt_vllm_llama-3.1-70b``
* ``pyt_vllm_llama-3.1-405b``
* ``pyt_vllm_llama-2-7b``
* ``pyt_vllm_mistral-7b``
* ``pyt_vllm_qwen2-7b``
* ``pyt_vllm_jais-13b``
* ``pyt_vllm_jais-30b``
.. _vllm-benchmark-standalone-v043:
Standalone benchmarking
=======================
You can run the vLLM benchmark tool independently by starting the
:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
snippet.
.. code-block::
docker pull rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name unified_docker_vllm rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
Multiprocessing distributed executor
--------------------------------------
To optimize vLLM performance, add the multiprocessing API server argument ``--distributed-executor-backend mp``.
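For reference, this flag is passed on the vLLM server command line when launching vLLM yourself rather than through the benchmark script (an illustrative invocation; the model choice is an assumption):
.. code-block:: shell
python3 -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --distributed-executor-backend mp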
Command
^^^^^^^^^^^^^^^^^^^^^^^^^
To start the benchmark, use the following command with the appropriate options.
See :ref:`Options <vllm-benchmark-standalone-options-v043>` for the list of
options and their descriptions.
.. code-block:: shell
./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
See the :ref:`examples <vllm-benchmark-run-benchmark-v043>` for more information.
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block:: shell
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
.. _vllm-benchmark-standalone-options-v043:
Options
^^^^^^^^^^^^^^^^^^^^^^^^^
.. list-table::
:header-rows: 1
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$model_repo``
- ``meta-llama/Meta-Llama-3.1-8B-Instruct``
- Llama 3.1 8B
* - (``float16``)
- ``meta-llama/Meta-Llama-3.1-70B-Instruct``
- Llama 3.1 70B
* -
- ``meta-llama/Meta-Llama-3.1-405B-Instruct``
- Llama 3.1 405B
* -
- ``meta-llama/Llama-2-7b-chat-hf``
- Llama 2 7B
* -
- ``mistralai/Mixtral-8x7B-Instruct-v0.1``
- Mixtral 8x7B
* -
- ``mistralai/Mixtral-8x22B-Instruct-v0.1``
- Mixtral 8x22B
* -
- ``mistralai/Mistral-7B-Instruct-v0.3``
- Mistral 7B
* -
- ``Qwen/Qwen2-7B-Instruct``
- Qwen2 7B
* -
- ``core42/jais-13b-chat``
- JAIS 13B
* -
- ``core42/jais-30b-chat-v3``
- JAIS 30B
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16``
- Data type
.. _vllm-benchmark-run-benchmark-v043:
Running the benchmark on the MI300X accelerator
-----------------------------------------------
Here are some examples of running the benchmark with various options.
See :ref:`Options <vllm-benchmark-standalone-options-v043>` for the list of
options and their descriptions.
Latency benchmark example
^^^^^^^^^^^^^^^^^^^^^^^^^
Use this command to benchmark the latency of the Llama 3.1 8B model on one GPU with the ``float16`` data type.
.. code-block::
./vllm_benchmark_report.sh -s latency -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
Find the latency report at:
- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv``
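To take a quick look at the results from the shell, you can render the CSV summary as an aligned table (a minimal sketch; ``column`` from util-linux is assumed to be available in the container):
.. code-block:: shell
column -s, -t ./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv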
Throughput benchmark example
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Use this command to benchmark the throughput of the Llama 3.1 8B model on one GPU with the ``float16`` data type.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
Find the throughput report at:
- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_throughput_report.csv``
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
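For example (illustrative numbers), 100 requests with an input length of 128 tokens and an output length of 128 tokens completing in 10 seconds give :math:`throughput\_tot = 100 \times (128 + 128) / 10 = 2560` tokens/s and :math:`throughput\_gen = 100 \times 128 / 10 = 1280` tokens/s.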
Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,416 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
ROCm Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
a prebuilt, optimized environment designed for validating large language model
(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
MI300X accelerator and includes the following components:
* `ROCm 6.2.1 <https://github.com/ROCm/ROCm>`_
* `vLLM 0.6.4 <https://docs.vllm.ai/en/latest>`_
* `PyTorch 2.5.0 <https://github.com/pytorch/pytorch>`_
* Tuning files (in CSV format)
With this Docker image, you can quickly validate the expected inference
performance numbers on the MI300X accelerator. This topic also provides tips on
optimizing performance with popular AI models.
.. hlist::
:columns: 6
* Llama 3.1 8B
* Llama 3.1 70B
* Llama 3.1 405B
* Llama 2 7B
* Llama 2 70B
* Mixtral 8x7B
* Mixtral 8x22B
* Mistral 7B
* Qwen2 7B
* Qwen2 72B
* JAIS 13B
* JAIS 30B
.. _vllm-benchmark-vllm:
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
Getting started
===============
Use the following procedures to reproduce the benchmark results on an
MI300X accelerator with the prebuilt vLLM Docker image.
.. _vllm-benchmark-get-started:
1. Disable NUMA auto-balancing.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
Once setup is complete, you can choose between two options to reproduce the
benchmark results:
- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad-v064>`
- :ref:`Standalone benchmarking <vllm-benchmark-standalone-v064>`
.. _vllm-benchmark-mad-v064:
MAD-integrated benchmarking
===========================
Clone the ROCm Model Automation and Dashboarding (MAD) repository (`<https://github.com/ROCm/MAD>`__) to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run a performance benchmark test of the Llama 3.1 8B model
on one GPU with the ``float16`` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
ROCm MAD launches a Docker container with the name
``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_float16/``.
Although the following models are preconfigured to collect latency and
throughput performance data, you can also change the benchmarking parameters.
Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone-v064>` section.
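For example, to collect FP8 results for Llama 3.1 70B, run its ``_fp8`` tag from the list below with the same command pattern:
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags pyt_vllm_llama-3.1-70b_fp8 --keep-model-dir --live-output --timeout 28800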
Available models
----------------
.. hlist::
:columns: 3
* ``pyt_vllm_llama-3.1-8b``
* ``pyt_vllm_llama-3.1-70b``
* ``pyt_vllm_llama-3.1-405b``
* ``pyt_vllm_llama-2-7b``
* ``pyt_vllm_llama-2-70b``
* ``pyt_vllm_mixtral-8x7b``
* ``pyt_vllm_mixtral-8x22b``
* ``pyt_vllm_mistral-7b``
* ``pyt_vllm_qwen2-7b``
* ``pyt_vllm_qwen2-72b``
* ``pyt_vllm_jais-13b``
* ``pyt_vllm_jais-30b``
* ``pyt_vllm_llama-3.1-8b_fp8``
* ``pyt_vllm_llama-3.1-70b_fp8``
* ``pyt_vllm_llama-3.1-405b_fp8``
* ``pyt_vllm_mixtral-8x7b_fp8``
* ``pyt_vllm_mixtral-8x22b_fp8``
.. _vllm-benchmark-standalone-v064:
Standalone benchmarking
=======================
You can run the vLLM benchmark tool independently by starting the
:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
snippet.
.. code-block::
docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.4 rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
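If you exit the container, you can return to it later without repeating ``docker run`` (a standard Docker workflow, not specific to this image):
.. code-block:: shell
docker start vllm_v0.6.4
docker exec -it vllm_v0.6.4 bash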
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
Command
-------
To start the benchmark, use the following command with the appropriate options.
See :ref:`Options <vllm-benchmark-standalone-v064-options>` for the list of
options and their descriptions.
.. code-block:: shell
./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
See the :ref:`examples <vllm-benchmark-run-benchmark-v064>` for more information.
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block:: shell
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
.. _vllm-benchmark-standalone-v064-options:
Options
-------
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$model_repo``
- ``meta-llama/Meta-Llama-3.1-8B-Instruct``
- Llama 3.1 8B
* - (``float16``)
- ``meta-llama/Meta-Llama-3.1-70B-Instruct``
- Llama 3.1 70B
* -
- ``meta-llama/Meta-Llama-3.1-405B-Instruct``
- Llama 3.1 405B
* -
- ``meta-llama/Llama-2-7b-chat-hf``
- Llama 2 7B
* -
- ``meta-llama/Llama-2-70b-chat-hf``
- Llama 2 70B
* -
- ``mistralai/Mixtral-8x7B-Instruct-v0.1``
- Mixtral 8x7B
* -
- ``mistralai/Mixtral-8x22B-Instruct-v0.1``
- Mixtral 8x22B
* -
- ``mistralai/Mistral-7B-Instruct-v0.3``
- Mistral 7B
* -
- ``Qwen/Qwen2-7B-Instruct``
- Qwen2 7B
* -
- ``Qwen/Qwen2-72B-Instruct``
- Qwen2 72B
* -
- ``core42/jais-13b-chat``
- JAIS 13B
* -
- ``core42/jais-30b-chat-v3``
- JAIS 30B
* - ``$model_repo``
- ``amd/Meta-Llama-3.1-8B-Instruct-FP8-KV``
- Llama 3.1 8B
* - (``float8``)
- ``amd/Meta-Llama-3.1-70B-Instruct-FP8-KV``
- Llama 3.1 70B
* -
- ``amd/Meta-Llama-3.1-405B-Instruct-FP8-KV``
- Llama 3.1 405B
* -
- ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
- Mixtral 8x7B
* -
- ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
- Mixtral 8x22B
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. _vllm-benchmark-run-benchmark-v064:
Running the benchmark on the MI300X accelerator
-----------------------------------------------
Here are some examples of running the benchmark with various options.
See :ref:`Options <vllm-benchmark-standalone-v064-options>` for the list of
options and their descriptions.
Example 1: latency benchmark
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Use these commands to benchmark the latency of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.
.. code-block::
./vllm_benchmark_report.sh -s latency -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
./vllm_benchmark_report.sh -s latency -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8
Find the latency reports at:
- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv``
- ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_latency_report.csv``
Example 2: throughput benchmark
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Use these commands to benchmark the throughput of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
./vllm_benchmark_report.sh -s throughput -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8
Find the throughput reports at:
- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_throughput_report.csv``
- ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_throughput_report.csv``
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,461 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
***********************************************************
LLM inference performance validation on AMD Instinct MI300X
***********************************************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on the AMD Instinct™ MI300X accelerator. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for the MI300X
accelerator and includes the following components:
* `ROCm 6.3.1 <https://github.com/ROCm/ROCm>`_
* `vLLM 0.6.6 <https://docs.vllm.ai/en/latest>`_
* `PyTorch 2.7.0 (2.7.0a0+git3a58512) <https://github.com/pytorch/pytorch>`_
With this Docker image, you can quickly validate the expected inference
performance numbers for the MI300X accelerator. This topic also provides tips on
optimizing performance with popular AI models. For more information, see the lists of
:ref:`available models for MAD-integrated benchmarking <vllm-benchmark-mad-v066-models>`
and :ref:`standalone benchmarking <vllm-benchmark-standalone-v066-options>`.
.. _vllm-benchmark-vllm:
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
Getting started
===============
Use the following procedures to reproduce the benchmark results on an
MI300X accelerator with the prebuilt vLLM Docker image.
.. _vllm-benchmark-get-started:
1. Disable NUMA auto-balancing.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
Once the setup is complete, choose between two options to reproduce the
benchmark results:
- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad-v066>`
- :ref:`Standalone benchmarking <vllm-benchmark-standalone-v066>`
.. _vllm-benchmark-mad-v066:
MAD-integrated benchmarking
===========================
Clone the ROCm Model Automation and Dashboarding (MAD) repository (`<https://github.com/ROCm/MAD>`__) to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run a performance benchmark test of the Llama 3.1 8B model
on one GPU with the ``float16`` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
ROCm MAD launches a Docker container with the name
``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_float16/``.
Although the following models are preconfigured to collect latency and
throughput performance data, you can also change the benchmarking parameters.
Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone-v066>` section.
.. _vllm-benchmark-mad-v066-models:
Available models
----------------
.. list-table::
:header-rows: 1
:widths: 2, 3
* - Model name
- Tag
* - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`_
- ``pyt_vllm_llama-3.1-8b``
* - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
- ``pyt_vllm_llama-3.1-70b``
* - `Llama 3.1 405B <https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct>`_
- ``pyt_vllm_llama-3.1-405b``
* - `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_
- ``pyt_vllm_llama-3.2-11b-vision-instruct``
* - `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`__
- ``pyt_vllm_llama-2-7b``
* - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`__
- ``pyt_vllm_llama-2-70b``
* - `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_
- ``pyt_vllm_mixtral-8x7b``
* - `Mixtral MoE 8x22B <https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1>`_
- ``pyt_vllm_mixtral-8x22b``
* - `Mistral 7B <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`_
- ``pyt_vllm_mistral-7b``
* - `Qwen2 7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_
- ``pyt_vllm_qwen2-7b``
* - `Qwen2 72B <https://huggingface.co/Qwen/Qwen2-72B-Instruct>`_
- ``pyt_vllm_qwen2-72b``
* - `JAIS 13B <https://huggingface.co/core42/jais-13b-chat>`_
- ``pyt_vllm_jais-13b``
* - `JAIS 30B <https://huggingface.co/core42/jais-30b-chat-v3>`_
- ``pyt_vllm_jais-30b``
* - `DBRX Instruct <https://huggingface.co/databricks/dbrx-instruct>`_
- ``pyt_vllm_dbrx-instruct``
* - `Gemma 2 27B <https://huggingface.co/google/gemma-2-27b>`_
- ``pyt_vllm_gemma-2-27b``
* - `C4AI Command R+ 08-2024 <https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024>`_
- ``pyt_vllm_c4ai-command-r-plus-08-2024``
* - `DeepSeek MoE 16B <https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat>`_
- ``pyt_vllm_deepseek-moe-16b-chat``
* - `Llama 3.1 70B FP8 <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`_
- ``pyt_vllm_llama-3.1-70b_fp8``
* - `Llama 3.1 405B FP8 <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`_
- ``pyt_vllm_llama-3.1-405b_fp8``
* - `Mixtral MoE 8x7B FP8 <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`_
- ``pyt_vllm_mixtral-8x7b_fp8``
* - `Mixtral MoE 8x22B FP8 <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`_
- ``pyt_vllm_mixtral-8x22b_fp8``
* - `Mistral 7B FP8 <https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV>`_
- ``pyt_vllm_mistral-7b_fp8``
* - `DBRX Instruct FP8 <https://huggingface.co/amd/dbrx-instruct-FP8-KV>`_
- ``pyt_vllm_dbrx_fp8``
* - `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_
- ``pyt_vllm_command-r-plus_fp8``
.. _vllm-benchmark-standalone-v066:
Standalone benchmarking
=======================
You can run the vLLM benchmark tool independently by starting the
:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
snippet.
.. code-block::
docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.6 rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
Command
-------
To start the benchmark, use the following command with the appropriate options.
See :ref:`Options <vllm-benchmark-standalone-v066-options>` for the list of
options and their descriptions.
.. code-block:: shell
./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
See the :ref:`examples <vllm-benchmark-run-benchmark-v066>` for more information.
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block:: shell
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
.. _vllm-benchmark-standalone-v066-options:
Options and available models
----------------------------
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$model_repo``
- ``meta-llama/Llama-3.1-8B-Instruct``
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`_
* - (``float16``)
- ``meta-llama/Llama-3.1-70B-Instruct``
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
* -
- ``meta-llama/Llama-3.1-405B-Instruct``
- `Llama 3.1 405B <https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct>`_
* -
- ``meta-llama/Llama-3.2-11B-Vision-Instruct``
- `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_
* -
- ``meta-llama/Llama-2-7b-chat-hf``
- `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`__
* -
- ``meta-llama/Llama-2-70b-chat-hf``
- `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`__
* -
- ``mistralai/Mixtral-8x7B-Instruct-v0.1``
- `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_
* -
- ``mistralai/Mixtral-8x22B-Instruct-v0.1``
- `Mixtral MoE 8x22B <https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1>`_
* -
- ``mistralai/Mistral-7B-Instruct-v0.3``
- `Mistral 7B <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`_
* -
- ``Qwen/Qwen2-7B-Instruct``
- `Qwen2 7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_
* -
- ``Qwen/Qwen2-72B-Instruct``
- `Qwen2 72B <https://huggingface.co/Qwen/Qwen2-72B-Instruct>`_
* -
- ``core42/jais-13b-chat``
- `JAIS 13B <https://huggingface.co/core42/jais-13b-chat>`_
* -
- ``core42/jais-30b-chat-v3``
- `JAIS 30B <https://huggingface.co/core42/jais-30b-chat-v3>`_
* -
- ``databricks/dbrx-instruct``
- `DBRX Instruct <https://huggingface.co/databricks/dbrx-instruct>`_
* -
- ``google/gemma-2-27b``
- `Gemma 2 27B <https://huggingface.co/google/gemma-2-27b>`_
* -
- ``CohereForAI/c4ai-command-r-plus-08-2024``
- `C4AI Command R+ 08-2024 <https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024>`_
* -
- ``deepseek-ai/deepseek-moe-16b-chat``
- `DeepSeek MoE 16B <https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat>`_
* - ``$model_repo``
- ``amd/Llama-3.1-70B-Instruct-FP8-KV``
- `Llama 3.1 70B FP8 <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`_
* - (``float8``)
- ``amd/Llama-3.1-405B-Instruct-FP8-KV``
- `Llama 3.1 405B FP8 <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`_
* -
- ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
- `Mixtral MoE 8x7B FP8 <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`_
* -
- ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
- `Mixtral MoE 8x22B FP8 <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`_
* -
- ``amd/Mistral-7B-v0.1-FP8-KV``
- `Mistral 7B FP8 <https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV>`_
* -
- ``amd/dbrx-instruct-FP8-KV``
- `DBRX Instruct FP8 <https://huggingface.co/amd/dbrx-instruct-FP8-KV>`_
* -
- ``amd/c4ai-command-r-plus-FP8-KV``
- `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. _vllm-benchmark-run-benchmark-v066:
Running the benchmark on the MI300X accelerator
-----------------------------------------------
Here are some examples of running the benchmark with various options.
See :ref:`Options <vllm-benchmark-standalone-v066-options>` for the list of
options and their descriptions.
Example 1: latency benchmark
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Use these commands to benchmark the latency of the Llama 3.1 70B model on eight GPUs with the ``float16`` and ``float8`` data types.
.. code-block::
./vllm_benchmark_report.sh -s latency -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16
./vllm_benchmark_report.sh -s latency -m amd/Llama-3.1-70B-Instruct-FP8-KV -g 8 -d float8
Find the latency reports at:
- ``./reports_float16/summary/Llama-3.1-70B-Instruct_latency_report.csv``
- ``./reports_float8/summary/Llama-3.1-70B-Instruct-FP8-KV_latency_report.csv``
Example 2: throughput benchmark
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Use these commands to benchmark the throughput of the Llama 3.1 70B model on eight GPUs with the ``float16`` and ``float8`` data types.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16
./vllm_benchmark_report.sh -s throughput -m amd/Llama-3.1-70B-Instruct-FP8-KV -g 8 -d float8
Find the throughput reports at:
- ``./reports_float16/summary/Llama-3.1-70B-Instruct_throughput_report.csv``
- ``./reports_float8/summary/Llama-3.1-70B-Instruct-FP8-KV_throughput_report.csv``
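To collect both measurements in a single pass, you can use the ``all`` test option from the options table above, as in this sketch:
.. code-block:: shell
./vllm_benchmark_report.sh -s all -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16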
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,329 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.7.3_20250325-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements-v073>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models-v073:
Available models
================
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model variant</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _vllm-benchmark-vllm:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements-v073:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing
popular AI models.
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/25070a1841df0dca585b7ddcb967c42aaec4b7c5/docs/dev-docker>`__.
Getting started
===============
Use the following procedures to reproduce the benchmark results on an
MI300X accelerator with the prebuilt vLLM Docker image.
.. _vllm-benchmark-get-started:
1. Disable NUMA auto-balancing.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
2. Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
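To confirm that the pull succeeded, you can list the locally available ``rocm/vllm`` images (a generic Docker check, independent of this guide):
.. code-block:: shell
docker images rocm/vllm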
Benchmarking
============
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad-v073:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (MAD) repository (`<https://github.com/ROCm/MAD>`__) to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models-v073>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
.. tab-item:: Standalone benchmarking
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with the :literal:`{{model.precision}}` data type.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with the :literal:`{{model.precision}}` data type.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
{% endfor %}
{% endfor %}
Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,345 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.3_20250415-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements-v083>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models-v083:
Supported models
================
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model variant</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _vllm-benchmark-vllm:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements-v083:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing
popular AI models.
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/7a9f58aae0e7215a5f3dccde60e35072c41656c2/docs/dev-docker>`__.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
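As a quick first check before running the full suite, you can confirm that your accelerators are visible to ROCm with ``rocm-smi``, which ships with ROCm:
.. code-block:: shell
rocm-smi --showproductname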
Pull the Docker image
=====================
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (MAD) repository (`<https://github.com/ROCm/MAD>`__) to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models-v083>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
{% if model.tunableop %}
.. note::
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
TunableOp automatically explores different implementations and configurations of certain PyTorch
operators to find the fastest one for your hardware.
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on``.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
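For example (an illustrative one-liner -- the exact layout of the run ``args`` in ``models.json`` may differ), you can flip the flag from the shell before launching the benchmark:
.. code-block:: shell
# switch TunableOp on wherever the run args currently disable it
sed -i 's/--tunableop off/--tunableop on/' models.json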
{% endif %}
.. tab-item:: Standalone benchmarking
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
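To skim a summary report from the shell (a generic sketch -- the report directory and CSV filename depend on the model and precision selected above), align the comma-separated columns into a table:
.. code-block:: shell
# render a summary CSV as an aligned table; adjust the glob to your run
cat ./reports_*_vllm_rocm*/summary/*_throughput_report.csv | column -s, -t | less -S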
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
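For example (an illustrative calculation, not a measured result), 100 requests with an input length of 128 tokens and an output length of 128 tokens that complete in 60 seconds yield ``throughput_tot = 100 × (128 + 128) / 60 ≈ 427`` tokens per second and ``throughput_gen = 100 × 128 / 60 ≈ 213`` tokens per second.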
{% endfor %}
{% endfor %}
Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -1,3 +1,5 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
@@ -7,9 +9,15 @@
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250513-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
@@ -28,10 +36,10 @@ vLLM inference performance testing
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
inference performance numbers <vllm-benchmark-performance-measurements-v085-20250513>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models:
.. _vllm-benchmark-available-models-v085-20250513:
Supported models
================
@@ -91,7 +99,7 @@ vLLM inference performance testing
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements:
.. _vllm-benchmark-performance-measurements-v085-20250513:
Performance measurements
========================
@@ -101,18 +109,18 @@ vLLM inference performance testing
page provides reference throughput and latency measurements for inferencing
popular AI models.
.. note::
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.
see the developer's guide at `<https://github.com/ROCm/vllm/tree/16d2b92ebcf90fe55cf73fa0b9329a6c9d3dede8/docs/dev-docker>`__.
System validation
=================
@@ -125,11 +133,13 @@ vLLM inference performance testing
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
@@ -141,7 +151,9 @@ vLLM inference performance testing
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
@@ -163,20 +175,24 @@ vLLM inference performance testing
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the ``{{model.precision}}`` data type on the host machine.
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
Although the :ref:`available models <vllm-benchmark-available-models-v085-20250513>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
@@ -206,18 +222,24 @@ vLLM inference performance testing
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
@@ -257,25 +279,32 @@ vLLM inference performance testing
Face token to the gated models.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
.. raw:: html
@@ -304,16 +333,22 @@ Further reading
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../../../inference-optimization/workload`.
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <../../hugging-face-models>`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../../../inference-optimization/index>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../../../fine-tuning/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,355 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250521-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements-v085-20250521>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models-v085-20250521:
Supported models
================
The following models are supported for inference performance benchmarking
with vLLM and ROCm. Some instructions, commands, and recommendations in this
documentation might vary by model -- select one to get started.
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _vllm-benchmark-vllm:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization before use, granted through an external license agreement with a third party.
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements-v085-20250521:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing
popular AI models.
.. note::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Pull the Docker image
=====================
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models-v085-20250521>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
{% if model.tunableop %}
.. note::
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
TunableOp automatically explores different implementations and configurations of certain PyTorch
operators to find the fastest one for your hardware.
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on``.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
{% endif %}
.. tab-item:: Standalone benchmarking
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
{% endfor %}
{% endfor %}
Further reading
===============
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,353 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.0.1_20250605-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements-v0901-20250605>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models-v0901-20250605:
Supported models
================
The following models are supported for inference performance benchmarking
with vLLM and ROCm. Some instructions, commands, and recommendations in this
documentation might vary by model -- select one to get started.
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _vllm-benchmark-vllm:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization before use, granted through an external license agreement with a third party.
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements-v0901-20250605:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing popular AI models.
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker>`__.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Pull the Docker image
=====================
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models-v0901-20250605>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
{% if model.tunableop %}
.. note::
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
TunableOp automatically explores different implementations and configurations of certain PyTorch
operators to find the fastest one for your hardware.
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on``.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
{% endif %}
.. tab-item:: Standalone benchmarking
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
{% endfor %}
{% endfor %}
Further reading
===============
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,353 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements-20250702>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models-20250702:
Supported models
================
The following models are supported for inference performance benchmarking
with vLLM and ROCm. Some instructions, commands, and recommendations in this
documentation might vary by model -- select one to get started.
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _vllm-benchmark-vllm:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization before use, granted through an external license agreement with a third party.
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements-20250702:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing popular AI models.
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/5486e7bc8523be0324ccd68f221959445b56cc2a/docs/dev-docker>`__.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Pull the Docker image
=====================
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models-20250702>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
{% if model.tunableop %}
.. note::
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
TunableOp automatically explores different implementations and configurations of certain PyTorch
operators to find the fastest one for your hardware.
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on``.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
{% endif %}
.. tab-item:: Standalone benchmarking
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
{% endfor %}
{% endfor %}
Further reading
===============
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

Some files were not shown because too many files have changed in this diff.