Compare commits

103 Commits

Author SHA1 Message Date
Alex Xu
a4b1b2cc67 rocm-docs-core experiment 2025-06-26 15:58:18 -04:00
Peter Park
4f592f8949 [docs/7.0.0-alpha] Add docs for 7.0 alpha (#4978) 2025-06-26 15:47:42 -04:00
Daniel Su
ac2df2961d [Ex CI] add component name to artifact download filter (#4974) 2025-06-26 13:55:03 -04:00
Daniel Su
f20e8dec8b [Ex CI] revert PRIM default branch to develop (#4960) 2025-06-23 16:35:02 -04:00
Daniel Su
10e9157f39 [Ex CI] allow rerun jobs to upload artifacts (#4959) 2025-06-23 15:37:52 -04:00
Daniel Su
a2ce6021cb [Ex CI] add more OSs to nightly build (#4958) 2025-06-23 15:13:11 -04:00
Peter Park
2196fc9a2f Fix pytorch training 25.6 doc (#4956)
* fix pytorch-training history

* fix pytorch-training

fix
2025-06-23 13:45:50 -04:00
Daniel Su
925689f89e [Ex CI] enable gfx1100 builds (#4954) 2025-06-23 11:26:35 -04:00
Peter Park
91a541f8b9 Update PyTorch training benchmark doc for v25.6 (#4950)
* update pytorch-training docker details

* add previous version

* add models data

* update models data id

* add models picker

* update data

* update fmt

fmt

* update data yaml

* update template

* update data

* fix

* fix vllm-0.6.4 broken link

* fix vllm history
2025-06-23 09:26:15 -04:00
Peter Park
34f8d57ece Organize version histories in ROCm for AI benchmark Docker docs (#4948)
* add vllm 0.8.3 20250415

update prev versions table

* add vllm previous versions page

* move index to vllm-history

* add standalone megatron-lm version history

* add pytorch training version history

* fix

* add vllm-0.4.3

* add vllm-0.6.4

* update vllm-history

* add vllm-0.7.3

* add vllm-0.6.6

* add notes

* fix vllm readme links

fix main page link

* add latest version to previous versions list

* add jax-maxtext history

* fix jax-maxtext history

* add pytorch-training history

* add link in jax-maxtext 25.4

* add megatron-lm history

* fix datatemplate path for vllm 0.8.3

* fix jax-maxtext history link

* update note about performance measurements

* add vllm 0.8.5_20250521 previous version

* consistency fixes
2025-06-20 15:01:38 -04:00
yugang-amd
55f95adc7c Update for vllm -06/10 (#4943) 2025-06-20 08:41:37 -04:00
Daniel Su
e05b1702d8 [Ex CI] fix experimental HIP to CLR triggers (#4946) 2025-06-19 12:56:53 -04:00
Daniel Su
4179042cf7 [Ex CI] add multi-OS support to copyHIP (#4945) 2025-06-19 12:15:22 -04:00
dependabot[bot]
ae2de81b79 Build(deps): Bump urllib3 from 2.4.0 to 2.5.0 in /docs/sphinx (#4942)
Bumps [urllib3](https://github.com/urllib3/urllib3) from 2.4.0 to 2.5.0.
- [Release notes](https://github.com/urllib3/urllib3/releases)
- [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst)
- [Commits](https://github.com/urllib3/urllib3/compare/2.4.0...2.5.0)

---
updated-dependencies:
- dependency-name: urllib3
  dependency-version: 2.5.0
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-06-19 09:03:29 -06:00
Daniel Su
efd6cec4a4 [Ex CI] disable downstream triggers for mathlibs not yet migrated (#4936) 2025-06-18 14:10:58 -04:00
Daniel Su
b65996587f [Ex CI] remove ALLOWED_PARTIAL_SUCCEED_BUILDS library variable (#4937) 2025-06-18 12:10:04 -04:00
yugang-amd
7b7eaf69f2 remove broken xref (#4939) 2025-06-18 10:15:53 -04:00
Daniel Su
4cfc8ddad2 [Ex CI] MIVisionX: add hipBLASLt to build deps (#4931) 2025-06-17 13:40:35 -04:00
Daniel Su
97ebbb227d [Ex CI] rocprof-sdk: add cmake, libsqlite3-dev (#4935) 2025-06-17 13:40:15 -04:00
Daniel Su
8c6a1726fe [Ex CI] remove old aqlprofile param in Pytorch (#4927) 2025-06-16 15:17:23 -04:00
Daniel Su
2656143c9e [Ex CI] fix ROCm versions (#4930) 2025-06-16 11:42:51 -04:00
Daniel Su
7910841c94 [Ex CI] rccl: use vendored gtest, use GPU_TARGETS flag (#4929) 2025-06-16 11:35:20 -04:00
Daniel Su
30fec8f74a [Ex CI] update ROCm versioning (#4928) 2025-06-16 11:31:19 -04:00
Daniel Su
1923f801e0 [Ex CI] fix hipRAND multi-OS tests, Tensile sparse dir (#4923) 2025-06-13 16:21:13 -04:00
Peter Park
d69037bfcc Fix Sphinx issue in vllm-benchmark 0.8.5-20250513 previous version (#4924)
* fix sphinx issue in vllm-benchmark 0.8.5-20250513 previous version

* update article_info in conf.py

* update rocm/vllm
2025-06-13 15:03:51 -04:00
Daniel Su
7ac6aa4084 [Ex CI] add OS support to monorepo downstream triggers (#4920) 2025-06-13 12:26:05 -04:00
Daniel Su
14f3c42320 [Ex CI] Tensile almalinux8 builds (#4915) 2025-06-12 16:43:55 -04:00
Daniel Su
67be6f6249 [Ex CI] migrate roc/hipRAND pipelines, change migrated mathlibs' default branch to rocm-rel-7.0 (#4918)
* [Ex CI] migrate roc/hipRAND pipeline IDs to monorepo

* [Ex CI] change migrated mathlibs' default branch to rocm-rel-7.0
2025-06-12 15:39:41 -04:00
powderluv
2502fc5bcf Update README.md to point to TheRock (#4907)
* Update README.md to point to TheRock

Point to TheRock build system to build ROCm

* Update README.md

---------

Co-authored-by: David Galiffi <dgaliffi@amd.com>
Co-authored-by: alexxu-amd <159800977+alexxu-amd@users.noreply.github.com>
2025-06-12 10:44:34 -04:00
Pratik Basyal
61c6749a10 Link to 6.4.1 updated from internal to public (#4913) 2025-06-10 16:59:52 -04:00
Daniel Su
8e8104c811 [Ex CI] add new rocprof-compute pip packages (#4905) 2025-06-10 16:06:51 -04:00
Peter Park
cfb3504d77 Add Mochi Video to pytorch-inference-benchmark-models.yaml
Add Mochi Video to pytorch-inference-benchmark-models.yaml
2025-06-10 13:18:41 -04:00
Joseph Macaranas
3602bc5142 [External CI] Revert hip/clr workaround (#4908)
- Undo workaround for hip/clr.
- Build latest hip/clr and deal with fallout.
- Issues need to be caught and escalated.
2025-06-10 11:13:14 -04:00
Scott Todd
cf4a8ecf28 Advertise TheRock near the top of README.md. (#4906) 2025-06-09 20:20:55 -07:00
Daniel Su
a5aae151b7 [Ex CI] shard rocPRIM tests across 3 runners (#4895) 2025-06-09 15:52:35 -04:00
Alex Xu
685457834a upgrade rocm-docs-core to 1.20.1 2025-06-09 14:53:09 -04:00
Joseph Macaranas
1c715356b6 [External CI] More AlmaLinux 8 Pipelines (#4898)
Added AlmaLinux 8 Pipeline Support
- aomp
- HIPIFY
- rocDecode
- ROCgdb
- rocJPEG
- rocprofiler
- aqlprofile dependency template
- build autotools template
- download latest cmake template

Pipeline Changes
- More gfx build targets.
- Copying llvm-lit to the llvm-project published artifacts.
- HIPIFY now uses our built version of llvm-project for its pipeline.
- Disable testing in HIPIFY pipeline due to low value provided. Revisit in the future.
- aomp's ROCm dependency list reduced.
- aomp's openmp build had issues with ninja on AlmaLinux 8.
2025-06-09 14:16:59 -04:00
Joseph Macaranas
ca8df59ba8 [External CI] Fix pytorch nightly build errors (#4901)
- Add hipSPARSELt dependency.
- Add hipBLASLt test dependency for rocroller shared library.
- Update pip dependency versions.
- Install an additional typing_extensions in a specific folder so that one of the builds we do not control can work.
- Wheel renaming no longer works, so we need to find another mechanism if we start doing builds for different branches and gfx architectures.
2025-06-09 13:15:41 -04:00
Joseph Macaranas
ad1ac5a4e8 [External CI] AlmaLinux 8 builds for more libraries (#4897)
- Fixed rocprim pipeline to not rebuild during install step.
- Updates to hipblas-common, hipcub, hiprand, and rocthrust pipelines to build on AlmaLinux8 and more gfx architectures.
- Include rocm-cmake dependency when CMake setup mentions it.
2025-06-09 10:35:52 -04:00
Daniel Su
e2d0f4a362 [Ex CI] delete unused global variables (#4896) 2025-06-09 10:30:17 -04:00
Daniel Su
f0bef19f15 Ex CI: roc/hipFFT downstream builds (#4840) 2025-06-06 15:20:46 -04:00
Ian Dass
204032493b [External CI] Ubuntu 24.04 job for llvm-project (#4893)
* [External CI] Ubuntu 24.04 job for llvm-project

* temporarily switch to using 'high' build pool while 'ultra' is down

* switch almalinux8 to build on manylinux container

* add pool for alma8 container

* switch alma8 package manager to apt

* Update llvm-project.yml

* switch back to dnf after resolved container init

---------

Co-authored-by: Joseph Macaranas <Joseph.Macaranas@amd.com>
2025-06-06 14:46:30 -04:00
Joseph Macaranas
934dd0892c [External CI] Unblock rocm-libraries progress by not building gfx11 rocprim (#4894) 2025-06-06 14:41:50 -04:00
Daniel Su
9c38a9cf71 [Ex CI] move from almalinux pool to manylinux containers (#4892) 2025-06-06 13:40:03 -04:00
Joseph Macaranas
894f137cda [External CI] AlmaLinux 8 pipeline for rocrand (#4891)
- Increase compilation coverage for rocrand to more gfx architectures.
- Follow similar path as recent rocprim pipeline changes.
- Add and fix conditionals in the cmake template to consolidate the cmake build and install steps, avoiding the rebuild that was previously being done (sketched after this entry). This is not required in the ubuntu 22.04 job.
- The build time is a little too long on the free agents, and we will soon hit the cap on free runners, so the build pool is changed.
2025-06-06 10:46:15 -04:00
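As a hedged illustration of the consolidation described above: the parameter name below matches the consolidateBuildAndInstall flag that appears later in this comparison, but the template body is a sketch under that assumption, not the actual build-cmake.yml.

parameters:
- name: consolidateBuildAndInstall
  type: boolean
  default: false
steps:
- ${{ if eq(parameters.consolidateBuildAndInstall, true) }}:
  # a single cmake invocation covers both build and install, so the
  # install step does not trigger a rebuild
  - script: cmake --build '$(Build.SourcesDirectory)/build' --target install
    displayName: Build and install (consolidated)
- ${{ else }}:
  - script: cmake --build '$(Build.SourcesDirectory)/build'
    displayName: Build
  - script: cmake --install '$(Build.SourcesDirectory)/build'
    displayName: Install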
Ian Dass
d331b19ede Merge pull request #4887 from ROCm/amd/idass/ub2404_support
[External CI] Add support for Ubuntu 24.04 OS
2025-06-05 17:20:30 -04:00
idass1990
1416c355e3 remove trailing whitespaces 2025-06-05 16:42:57 -04:00
Daniel Su
c5f9be0375 Ex CI: add msgpack to rocm-examples (#4890) 2025-06-05 16:41:48 -04:00
idass1990
769eee92bf fixed formatting 2025-06-05 16:38:53 -04:00
Daniel Su
c6baf14252 Ex CI: switch all prims to monorepo (#4889) 2025-06-05 16:16:57 -04:00
Daniel Su
87d5a210c1 Ex CI: added alma/manylinux builds and GPU targets for rocPRIM, gtest vendoring (#4885)
Co-authored-by: Joseph Macaranas <Joseph.Macaranas@amd.com>
2025-06-05 14:13:25 -04:00
yugang-amd
830f2d5edf Update for vllm -05/27 (#4886)
* Update vLLM inference benchmark Docker page for rocm/vllm 5/27

* update repo for Pytorch
2025-06-05 13:30:20 -04:00
idass
1dd4b4230d [External CI] Add support for Ubuntu 24.04 OS 2025-06-05 12:29:09 -04:00
Daniel Su
5346748889 Ex CI: add msgpack to MIOpen (#4882) 2025-06-04 14:38:59 -04:00
Pratik Basyal
2e32d1d3d4 KMD version updated in compatibility matrix (#4873) 2025-06-04 06:20:45 -04:00
ammallya
9cff634d8c Promoting 6.4.x to Develop (#4877) 2025-06-03 19:13:52 -07:00
Pratik Basyal
3f3d592e2b Radeon PRO W7700 and Radeon RX 7800 XT support added (#4874)
* Radeon PRO W7700 and Radeon RX 7800 XT support added

* gfx1101 added to compatibility matrix

* Footnote added for gfx1101

* Footnote updated

* Footnote reference fixed
2025-06-03 15:59:22 -04:00
Daniel Su
e12996054e Ex CI: manifest changes to support monorepo & mathlibs builds (#4871) 2025-06-02 16:09:54 -04:00
Joseph Macaranas
ff11bd392e [External CI] AlmaLinux 8 Support Refactor (#4869)
GCC Toolset 14 Environment
- source /opt/rh/gcc-toolset-14/enable only lasts for the shell session, so it is run at the beginning of the relevant build and test tasks when the OS is AlmaLinux 8 (see the sketch after this entry).
- CMake tasks set env to behave as if the source /opt/rh/gcc-toolset-14/enable command had been run.
- Observed that the built ROCm libraries can be installed in either the lib or lib64 directories in this OS profile, so the ldconfig step is adjusted to look at additional directories. This won't impact usage on ubuntu22 if the lib64 directories don't exist in the custom ROCm build.
- For the llvm linking step we cannot assume the ROCm lib directory exists, as only the ROCm lib64 directory might be present in the build environment.
- libatomic package was added to the gcc toolset setup.

yaml-based Changes
- The base set of dnf packages is now defined in an array, covering dependencies that already come pre-installed on the ubuntu22 VMs.
- Changed format of the job matrix for readability.

New Features
- AlmaLinux 8 pipelines for roctracer and ROCdbgapi.
- roctracer pipeline expanded to support compilation for gfx1030 and gfx1100.
- AlmaLinux 8 llvm-project pipeline now builds flang and flang-rt, so re-enabled for ubuntu 22.04 pipeline as well.

TODO
- Revisit why ninja-build is not used for comgr, device-libs, and hipcc.
2025-06-02 11:10:09 -04:00
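The per-task guard that the diffs further down add for this (for example in the kfdtest and rocrtst build steps) amounts to the following minimal sketch; the cmake/make invocation is illustrative:

- task: Bash@3
  displayName: Build step with GCC Toolset 14 enabled
  inputs:
    targetType: inline
    script: |
      # the effect of `source /opt/rh/gcc-toolset-14/enable` lasts only for
      # this shell session, so each relevant build/test task sources it when
      # running on AlmaLinux 8
      if [ -e /opt/rh/gcc-toolset-14/enable ]; then
        source /opt/rh/gcc-toolset-14/enable
      fi
      mkdir -p build && cd build
      cmake .. && make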
yugang-amd
53d3e092d3 Fix broken link (#4854) 2025-05-31 13:01:34 -04:00
yugang-amd
6b5586fd2c Add Radeon RX 9070 (gfx1201), Radeon RX 9070 GRE (gfx1201) (#4858)
* Add Radeon RX 9070 (1201), Radeon RX 9070 GRE (1201)

* Update compatibility-matrix.rst
2025-05-30 17:44:36 -04:00
Joseph Macaranas
b8e115d56a [External CI] clr/hip workaround (#4857)
- Build specific commit hashes to enable more OS profiles to be built up
- Revert this pull request when clr/hip is fixed.
- Build Log
2025-05-30 17:35:39 -04:00
alexxu-amd
c1919faccd Change viewer link from latest to mainline in what-is-rocm page (#4856)
* change viewer link from latest to mainline

* correct format
2025-05-30 17:18:02 -04:00
Joseph Macaranas
231dc6be36 [External CI] AlmaLinux 8 job for clr and rocminfo (#4855)
- Added more dnf package mappings and a base-level dependency.
- Fix registering for ROCm packages with dnf.
2025-05-30 16:41:40 -04:00
Swati Rawat
6142df329b Docs: Add rocprof-compute-viewer (#4850)
* Docs: Add rocprof-compute-viewer

* update requirements.txt

---------

Co-authored-by: Alex Xu <alex.xu@amd.com>
2025-05-30 15:21:10 -04:00
Peter Park
2addcb0bca Add RHEL 9.6 to compat matrix (#4839)
* add RHEL 9.6 to compat matrix

* add os support note
2025-05-30 14:55:52 -04:00
Jeffrey Novotny
106cecba5e Update release notes with RHEL 9.6 (#4848) 2025-05-30 14:48:58 -04:00
Joseph Macaranas
d62d12fd55 [External CI] AlmaLinux 8 job for ROCR-Runtime (#4852)
- Added more dnf package mappings and a base-level dependency.
- Added missing os parameter for rocm dependencies call.
2025-05-30 14:09:58 -04:00
Daniel Su
bae5bdd177 Ex CI: add rocm-cmake to hipBLASLt (#4843) 2025-05-30 11:38:57 -04:00
Joseph Macaranas
8afca4af80 [External CI] AlmaLinux8 job for llvm-project (#4849)
- Removed building flang in this pipeline. Flang will be built in the aomp pipeline to unblock progress on the runtimes and the first set of math libraries. Flang debugging can also be moved to a cheaper VM.
- ninja-build from dnf is too old for llvm-project. Using a release from GitHub instead (see the sketch after this entry).
- Added more dnf package mappings.
- The scl enable command is not needed.
2025-05-30 10:56:19 -04:00
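A sketch of installing ninja from a GitHub release instead of dnf; the pinned version is an example, not taken from this pipeline:

- task: Bash@3
  displayName: Install ninja from GitHub releases
  inputs:
    targetType: inline
    script: |
      # dnf's ninja-build is too old for llvm-project, so fetch a release
      # binary instead (the version pinned here is illustrative)
      wget -q https://github.com/ninja-build/ninja/releases/download/v1.12.1/ninja-linux.zip
      unzip -o ninja-linux.zip
      sudo install -m 755 ninja /usr/local/bin/ninja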
Daniel Su
0c3e2ea01d Ex CI: make miopen-get-ck-build only fetch from successful CK builds (#4842) 2025-05-30 09:51:53 -04:00
Daniel Su
086089128e Ex CI: disable mathlibs-trigger triggers, fix template path (#4838) 2025-05-30 09:51:32 -04:00
Peter Park
6999c24402 Add microsoft/phi-4 vllm-benchmark-models (#4801)
* add Phi-4 to vllm-benchmark-models.yaml

fix model_repo

* update model group names
2025-05-30 06:37:13 -04:00
Peter Park
93fd0ef1d4 Update ML framework Docker inventories for 6.4.1 (#4841)
* Update tensorflow Docker compatibility table

* update jax Docker compatibility table

* fix py versions

* update pytorch Docker compatibility table
2025-05-29 18:33:31 -04:00
Peter Park
daf2e980d9 Add Falcon-180B to vLLM benchmark Docker doc (#4836)
* add Falcon to vllm-benchmark-models.yaml

* update group name
2025-05-29 18:26:21 -04:00
Daniel Su
f10d80f6cc Merge pull request #4835 from ROCm/amd/danielsu/whole-mathlibs
Ex CI: start creating a unified mathlib build
2025-05-29 13:06:29 -04:00
Daniel Su
272c9eabc9 Ex CI: guard test jobs behind unifiedBuild check 2025-05-29 12:11:22 -04:00
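A minimal sketch of the unifiedBuild guard mentioned above, assuming the boolean unifiedBuild parameter introduced by the mathlibs template later in this comparison; the job and dependency names are illustrative:

- ${{ if not(parameters.unifiedBuild) }}:
  # test jobs only run for standalone pipelines; in a unified mathlibs
  # build they would duplicate work, so the guard skips them
  - job: rocRAND_test_gfx942              # illustrative job name
    dependsOn: rocRAND_build_ubuntu2204   # illustrative dependency
    steps:
    - script: echo "run tests here"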
Daniel Su
a62560342e Ex CI: unified mathlibs build (up to rocSOLVER) 2025-05-29 12:11:17 -04:00
Joseph Macaranas
21a4565e02 [External CI] Add support for other OS profile builds, starting with AlmaLinux OS 8.10 (#4823)
- Modified job matrices and templates to support a second OS.
- Included creation of Virtual Machine Scale Sets running AlmaLinux OS 8.10 with GCC toolset 14 to match manylinux 2_28.
- Dependency download algorithm modified so that only a single array of package manager (apt) packages needs to be provided as input; the other package managers then use a mapping of equivalent packages (see the sketch after this entry).
- Cleaned up python3-pip in the arrays as it should already be on the VMs.
- This will be an iterative process of getting components to build on this OS profile, starting with the components that don't have interdependencies.
- Highest priority is to get the rocm-libraries working.
2025-05-28 16:12:53 -04:00
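A hedged sketch of the described algorithm: the apt array is the single input, and dnf names are looked up from a mapping of equivalents. The mapping entries and the packageManager parameter shape are illustrative:

- task: Bash@3
  displayName: Install dependencies via the job's package manager
  inputs:
    targetType: inline
    script: |
      # apt names are the single source of truth; dnf equivalents come
      # from a lookup table (entries here are examples only)
      declare -A apt_to_dnf=( ["libmsgpack-dev"]="msgpack-devel" ["python3-dev"]="python3-devel" )
      packages="cmake ninja-build libmsgpack-dev python3-dev"
      if [ "${{ parameters.packageManager }}" = "apt" ]; then
        sudo apt-get install -y $packages
      else
        mapped=""
        for p in $packages; do mapped="$mapped ${apt_to_dnf[$p]:-$p}"; done
        sudo dnf install -y $mapped
      fi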
Daniel Su
394e3ffe11 Ex CI: remove SPARSEs as build dep for rocSOLVER (#4822) 2025-05-28 16:00:27 -04:00
Peter Park
2eb8bf4963 Fix typo in Megatron-LM Docker pull tags (#4829) 2025-05-28 15:18:00 -04:00
Peter Park
9dbc10b4c5 Fix rocm/vllm pull tag
Fix rocm/vllm pull tag
2025-05-28 14:42:21 -04:00
Peter Park
cebf0f5975 Add latest rocm/vllm Docker details in vLLM inference benchmark guide (#4824)
* update rocm/vllm Docker details to latest release

* Add previous vLLM version

* fix 'further reading' xrefs

* improve model grouping names

* fix links

* update model picker text
2025-05-28 14:20:18 -04:00
Peter Park
0acb457389 Add RDNA4 RX 9070 GRE to gpu-arch-specs.rst and RELEASE.md (#4820) 2025-05-28 10:19:30 -04:00
Daniel Su
9e23c2ea2b Ex CI: pull rocPRIM builds from monorepo pipeline (#4816) 2025-05-27 17:11:45 -04:00
yugang-amd
080a7339f0 Update SGPR for RDNA3 and RDNA2 series (#4810) 2025-05-27 15:12:36 -04:00
Daniel Su
932d6f551b Ex CI: use clr amd-staging builds again (#4811) 2025-05-27 12:18:12 -04:00
yugang-amd
fef80c324d Bump up requirement version (#4804)
* bump up requirement version

* update requirements.txt

* Use Python 3.10
2025-05-27 10:58:19 -04:00
dependabot[bot]
5ffc336620 Build(deps): Bump tornado from 6.4.2 to 6.5.1 in /docs/sphinx (#4796)
Bumps [tornado](https://github.com/tornadoweb/tornado) from 6.4.2 to 6.5.1.
- [Changelog](https://github.com/tornadoweb/tornado/blob/master/docs/releases.rst)
- [Commits](https://github.com/tornadoweb/tornado/compare/v6.4.2...v6.5.1)

---
updated-dependencies:
- dependency-name: tornado
  dependency-version: 6.5.1
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-05-26 17:29:40 -06:00
dependabot[bot]
a6c5fc4a2c Build(deps): Bump rocm-docs-core from 1.18.2 to 1.19.0 in /docs/sphinx (#4806)
Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.18.2 to 1.19.0.
- [Release notes](https://github.com/ROCm/rocm-docs-core/releases)
- [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.18.2...v1.19.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-version: 1.19.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-05-26 17:23:42 -06:00
yugang-amd
32939eed40 Merge pull request #4802 from yugang-amd/link-fix
fix broken links
2025-05-26 14:42:47 -04:00
yugang-amd
454331ba59 fix links 2025-05-26 13:50:25 -04:00
yugang-amd
637174f644 Merge pull request #4791 from yugang-amd/wavefront-size
Update wavefront size
2025-05-26 10:10:40 -04:00
yugang-amd
934ce63840 fix typo 2025-05-26 10:02:14 -04:00
yugang-amd
230b01565f update wavefront size 2025-05-22 17:49:08 -04:00
Peter Park
505041d90a Document specs for Radeon RX 9070 + small fix in megatron-lm doc (#4780)
* Document specs for Radeon RX 9070

* fix wrong version in megatron-lm.rst
2025-05-22 16:28:17 -04:00
Daniel Su
3ba79f9431 Ex CI: use clr amd-mainline temporarily (#4781) 2025-05-22 16:04:44 -04:00
yugang-amd
714b5395f8 Merge pull request #4773 from yugang-amd/rocshmem-xref
update rocSHMEM xrefs
2025-05-22 15:13:50 -04:00
Peter Park
9ed65a81c4 Add Megatron-LM benchmark doc 5/2 (#4778)
* reorg files

* add tabs

* update template

* update template

* update wordlist and toc

* add previous version to doc

* add selector paragraph

* update wordlist.txt
2025-05-22 14:28:18 -04:00
Peter Park
6d9f430c70 fix 9070 XT gfx target in gpu-arch-specs table (#4775) 2025-05-22 12:11:18 -04:00
yugang-amd
7697298f5d update rocSHMEM xrefs 2025-05-22 10:41:25 -04:00
Daniel Su
854bd268bf Ex CI: enable downstream job triggers for PRIMs and RANDs (#4761)
* Remove sparseCheckout param

* Add support for downloading same-pipeline-builds for monorepo chain builds

* Make local-artifact step names more informative

* Use componentName param for artifact filenames

* Enable chain downstream triggers for PRIMs & RANDs

* Set preTargetFilter for tests' local-artifact-download call

* Set checkout: none for test jobs

* Exclude failing rocThrust scan.hip test

* Matrixize downstream jobs
2025-05-22 10:24:06 -04:00
Peter Park
f1f2b3cac2 remove HIP upcoming changes reference link (#4771) 2025-05-21 12:09:35 -07:00
131 changed files with 5997 additions and 6096 deletions

View File

@@ -0,0 +1,33 @@
variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo
parameters:
- name: pipelinesRepoRef
  type: string
  default: refs/heads/develop
- name: librariesRepoRef
  type: string
  default: refs/heads/develop
resources:
  repositories:
  - repository: pipelines_repo
    type: github
    endpoint: ROCm
    name: ROCm/ROCm
    ref: ${{ parameters.pipelinesRepoRef }}
  - repository: libraries_repo
    type: github
    endpoint: ROCm
    name: ROCm/rocm-libraries
    ref: ${{ parameters.librariesRepoRef }}
trigger: none
pr: none
jobs:
- template: /.azuredevops/ci-builds/mathlibs.yml@pipelines_repo
  parameters:
    checkoutRepo: libraries_repo
    buildDependsOn: false

View File

@@ -0,0 +1,38 @@
# entrypoint for kicking off a unified build of the mathlibs
# this template is designed to be called by another pipeline (llvm, clr, etc.)
# `buildDependsOn` will need to be set when calling this template
# passes a `unifiedBuild` param to downstream pipelines, which will prevent duplicate jobs
# logic needs to be added in individual mathlib pipelines for handling `unifiedBuild`
parameters:
- name: checkoutRepo
  type: string
  default: monorepo
- name: buildDependsOn
  type: object
  default: false
- name: downstreamComponentMatrix
  type: object
  default:
  - rocRAND:
      name: rocRAND
      sparseCheckoutDir: projects/rocrand
  - rocPRIM:
      name: rocPRIM
      sparseCheckoutDir: projects/rocprim
  - hipBLAS-common:
      name: hipBLAS-common
      sparseCheckoutDir: projects/hipblas-common
  # - composable_kernel:
  #     name: composable_kernel
  #     sparseCheckoutDir: projects/composablekernel
jobs:
- ${{ each component in parameters.downstreamComponentMatrix }}:
  - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
      sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
      buildDependsOn: ${{ parameters.buildDependsOn }}
      triggerDownstreamJobs: true
      unifiedBuild: true
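For contrast with the entrypoint above, which passes buildDependsOn: false, a caller that chains the mathlibs behind an upstream build would set buildDependsOn per component, since the template comments say it must be set by the caller. The shape of the object and the upstream job name below are assumptions, inferred from how buildDependsOn is indexed later in this comparison:

jobs:
- template: /.azuredevops/ci-builds/mathlibs.yml@pipelines_repo
  parameters:
    checkoutRepo: libraries_repo
    buildDependsOn:
      rocRAND: upstream_llvm_build          # hypothetical upstream job name
      rocPRIM: upstream_llvm_build          # hypothetical upstream job name
      hipBLAS-common: upstream_llvm_build   # hypothetical upstream job name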

View File

@@ -20,7 +20,7 @@ parameters:
- ocl-icd-libopencl1
- ocl-icd-opencl-dev
- opencl-headers
- python3-pip
- zlib1g-dev
- name: pipModules
type: object
default:
@@ -41,120 +41,148 @@ parameters:
# any changes for clr should just trigger HIP pipeline
# similarly for hipother repo, for Nvidia backend
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
# HIP with AMD backend
jobs:
- job: hip_clr_combined_amd
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
# checkout triggering repo (either HIP or clr)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
# if this is triggered by HIP repo, matching repo is clr
# if this is triggered by clr repo, matching repo is HIP
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: matching_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: hipother_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependenciesAMD }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
# compile clr
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: clr
cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
extraBuildFlags: >-
-DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
-DHIP_PLATFORM=amd
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
-DCLR_BUILD_HIP=ON
-DCLR_BUILD_OCL=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
artifactName: amd
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: amd
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
# environment: amd
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hip_clr_combined_${{ job.os }}_amd
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
# checkout triggering repo (either HIP or clr)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
# if this is triggered by HIP repo, matching repo is clr
# if this is triggered by clr repo, matching repo is HIP
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: matching_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: hipother_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependenciesAMD }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
# compile clr
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: clr
cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
-DHIP_PLATFORM=amd
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
-DCLR_BUILD_HIP=ON
-DCLR_BUILD_OCL=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
artifactName: amd
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: amd
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
# environment: amd
# HIP with Nvidia backend
- job: hip_clr_combined_nvidia
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
# checkout triggering repo (either HIP or clr)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
# if this is triggered by HIP repo, matching repo is clr
# if this is triggered by clr repo, matching repo is HIP
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: matching_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: hipother_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependenciesNvidia }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- script: 'ls -1R $(Agent.BuildDirectory)/rocm'
displayName: 'Artifact listing'
# compile clr
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: clr
cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
extraBuildFlags: >-
-DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
-DHIP_PLATFORM=nvidia
-DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
-DCLR_BUILD_HIP=ON
-DCLR_BUILD_OCL=OFF
-DHIPNV_DIR=$(Build.SourcesDirectory)/hipother/hipnv
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: nvidia
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
# environment: nvidia
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hip_clr_combined_${{ job.os }}_nvidia
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
# checkout triggering repo (either HIP or clr)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
# if this is triggered by HIP repo, matching repo is clr
# if this is triggered by clr repo, matching repo is HIP
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: matching_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: hipother_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependenciesNvidia }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
- script: 'ls -1R $(Agent.BuildDirectory)/rocm'
displayName: 'Artifact listing'
# compile clr
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: clr
cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
-DHIP_PLATFORM=nvidia
-DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
-DCLR_BUILD_HIP=ON
-DCLR_BUILD_OCL=OFF
-DHIPNV_DIR=$(Build.SourcesDirectory)/hipother/hipnv
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
artifactName: nvidia
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
# environment: nvidia

View File

@@ -1,4 +1,7 @@
parameters:
- name: componentName
type: string
default: HIPIFY
- name: checkoutRepo
type: string
default: 'self'
@@ -13,113 +16,140 @@ parameters:
- name: aptPackages
type: object
default:
- cmake
- ninja-build
- cuda-toolkit-12-9
- libcudnn9-dev-cuda-12
- libnuma-dev
- mesa-common-dev
- ninja-build
- python-is-python3
- python3-dev
- python3-pip
- python-is-python3
- mesa-common-dev
- ccache
- cuda-toolkit
- cudnn
- name: pipModules
type: object
default:
- lit
- name: rocmDependencies
type: object
default:
- llvm-project
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: HIPIFY
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: UPSTREAM_LLVM_GIT_URL
value: https://github.com/llvm/llvm-project.git
- name: UPSTREAM_LLVM_TAG
value: llvmorg-18.1.2
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:
- task: Bash@3
displayName: 'Register CUDA packages'
inputs:
targetType: inline
script: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo rm -f cuda-keyring_1.1-1_all.deb
sudo apt update
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: Bash@3
displayName: git clone upstream llvm-project
inputs:
targetType: inline
script: git clone $(UPSTREAM_LLVM_GIT_URL) --depth=1 --branch $(UPSTREAM_LLVM_TAG) --recurse-submodules
workingDirectory: $(Pipeline.Workspace)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- script: |
mkdir -p $(CCACHE_DIR)
echo "##vso[task.prependpath]/usr/lib/ccache:/usr/local/cuda/bin"
displayName: Update path for cuda and ccache
- task: Cache@2
displayName: Ccache caching
inputs:
key: HIPIFY | $(Agent.OS) | "$(UPSTREAM_LLVM_TAG)"
path: $(CCACHE_DIR)
restoreKeys: HIPIFY | $(Agent.OS)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: upstream-llvm
cmakeBuildDir: $(Pipeline.Workspace)/llvm-project/llvm/build
cmakeSourceDir: $(Pipeline.Workspace)/llvm-project/llvm
installDir: $(Pipeline.Workspace)/llvm
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DLLVM_ENABLE_PROJECTS=clang
-DLLVM_INCLUDE_TESTS=OFF
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-DCMAKE_C_COMPILER_LAUNCHER=ccache
-GNinja
- task: Bash@3
displayName: python install lit
inputs:
targetType: inline
script: sudo python3 $(Pipeline.Workspace)/llvm-project/llvm/utils/lit/setup.py install
- task: Bash@3
displayName: install FileCheck
inputs:
targetType: inline
script: cp $(Pipeline.Workspace)/llvm-project/llvm/build/bin/FileCheck $(Pipeline.Workspace)/llvm/bin
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: HIPIFY
extraBuildFlags: >-
-DHIPIFY_CLANG_TESTS=ON
-DCMAKE_BUILD_TYPE=Release
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/targets/x86_64-linux
-DCUDA_DNN_ROOT_DIR=/usr/local/cuda/targets/x86_64-linux
-DCMAKE_PREFIX_PATH=$(Pipeline.Workspace)/llvm;/usr/local/cuda/targets/x86_64-linux/lib
-DLLVM_EXTERNAL_LIT=$(Pipeline.Workspace)/llvm-project/llvm/build/bin/llvm-lit
multithreadFlag: -- -j32
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: HIPIFY
testDir: $(Build.SourcesDirectory)/build
testExecutable: make
testParameters: test-hipify
testPublishResults: false
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: combined
registerCUDAPackages: true
extraCopyDirectories:
- llvm-project
extraEnvVars:
- UPSTREAM_LLVM_GIT_URL:::https://github.com/llvm/llvm-project.git
- UPSTREAM_LLVM_TAG:::llvmorg-18.1.2
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
name: rocm-ci_medium_build_pool_2404
${{ else }}:
name: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- task: Bash@3
displayName: 'Register CUDA packages'
inputs:
targetType: inline
${{ if eq(job.os, 'ubuntu2204') }}:
script: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo rm -f cuda-keyring_1.1-1_all.deb
sudo apt update
${{ if eq(job.os, 'almalinux8') }}:
script: |
sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- task: Bash@3
displayName: Add lit to PATH
inputs:
targetType: inline
script: |
site_packages=$(python3 -m site --user-base)/bin
sudo ln -sf $site_packages/lit $(Pipeline.Workspace)/llvm-lit
echo "##vso[task.prependpath]$site_packages"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
# cutensor is not available from apt or dnf
- task: Bash@3
displayName: 'Download and install cutensor'
inputs:
targetType: inline
script: |
wget -q --show-progress https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor/linux-x86_64/libcutensor-linux-x86_64-2.2.0.0-archive.tar.xz
tar -xvJf libcutensor-linux-x86_64-*.tar.xz
mkdir -p $(Pipeline.Workspace)/cutensor
cp -r libcutensor-linux-x86_64-*/* $(Pipeline.Workspace)/cutensor/
- task: Bash@3
displayName: 'List downloaded CUDA files'
inputs:
targetType: inline
script: ls -la1R /usr/local/cuda-12.9
# script: cp $(Pipeline.Workspace)/llvm-project/llvm/build/bin/FileCheck $(Pipeline.Workspace)/llvm/bin
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;/usr/local/cuda/targets/x86_64-linux/lib
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DHIPIFY_CLANG_TESTS=ON
-DCMAKE_BUILD_TYPE=Release
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.9
-DCUDA_DNN_ROOT_DIR=/usr/local/cuda-12.9
-DCUDA_CUB_ROOT_DIR=/usr/local/cuda-12.9/targets/x86_64-linux/include/cub
-DCUDA_TENSOR_ROOT_DIR=$(Pipeline.Workspace)/cutensor/
multithreadFlag: -- -j32
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
# parameters:
# componentName: HIPIFY
# testDir: $(Build.SourcesDirectory)/build
# testExecutable: make
# testParameters: -j 32 test-hipify
# testPublishResults: false
# os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: combined
registerCUDAPackages: true
extraCopyDirectories:
- llvm-project

View File

@@ -16,6 +16,7 @@ parameters:
- cmake
- jq
- libdrm-dev
- libmsgpack-dev
- libsqlite3-dev
- libstdc++-12-dev
- ninja-build

View File

@@ -43,18 +43,20 @@ parameters:
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- AMDMIGraphX
- clr
- half
- hipBLAS-common
- hipBLASLt
- llvm-project
- MIOpen
- rocBLAS
- rocDecode
- rocm-cmake
- rocminfo
- rocprofiler-register
- half
- rocBLAS
- MIOpen
- AMDMIGraphX
- ROCR-Runtime
- rpp
- rocDecode
- name: rocmTestDependencies
type: object
default:
@@ -90,8 +92,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -20,7 +20,6 @@ parameters:
- libnuma-dev
- ninja-build
- pkg-config
- python3-pip
- name: rocmDependencies
type: object
default:
@@ -36,51 +35,65 @@ parameters:
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: ROCR_Runtime_build
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_SHARED_LIBS=ON
-DCMAKE_BUILD_TYPE=Release
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ROCR_Runtime_build_${{ job.os }}
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_SHARED_LIBS=ON
-DCMAKE_BUILD_TYPE=Release
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ROCR_Runtime_test_${{ job.target }}
dependsOn: ROCR_Runtime_build
- job: ROCR_Runtime_test_${{ job.os }}_${{ job.target }}
dependsOn: ROCR_Runtime_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -97,6 +110,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- task: Bash@3
displayName: Install libhwloc5
inputs:
@@ -107,12 +121,15 @@ jobs:
sudo apt install -y --allow-downgrades ./libhwloc5_1.11.12-3_amd64.deb ./libhwloc-dev_1.11.12-3_amd64.deb
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -121,11 +138,13 @@ jobs:
runRocminfo: false
- task: Bash@3
displayName: Build kfdtest
continueOnError: true
inputs:
targetType: 'inline'
workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest
script: |
if [ -e /opt/rh/gcc-toolset-14/enable ]; then
source /opt/rh/gcc-toolset-14/enable
fi
mkdir build && cd build
cmake -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm ..
make
@@ -135,13 +154,16 @@ jobs:
testExecutable: BIN_DIR=$(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/build ./run_kfdtest.sh
testParameters: '-p core --gtest_output=xml:./test_output.xml --gtest_color=yes'
testDir: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/scripts
os: ${{ job.os }}
- task: Bash@3
displayName: Build rocrtst
continueOnError: true
inputs:
targetType: 'inline'
workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common
script: |
if [ -e /opt/rh/gcc-toolset-14/enable ]; then
source /opt/rh/gcc-toolset-14/enable
fi
BASE_CLANG_DIR=$(Agent.BuildDirectory)/rocm/llvm/lib/clang
export NEWEST_CLANG_VER=$(ls -1 $BASE_CLANG_DIR | sort -V | tail -n 1)
mkdir build && cd build
@@ -159,6 +181,7 @@ jobs:
testExecutable: ./rocrtst64
testParameters: '--gtest_filter="-rocrtstNeg.Memory_Negative_Tests:rocrtstFunc.Memory_Max_Mem" --gtest_output=xml:./test_output.xml --gtest_color=yes'
testDir: $(Build.SourcesDirectory)/rocrtst/suites/test_common/build/${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -15,7 +15,6 @@ parameters:
default:
- cmake
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
default:
@@ -24,37 +23,57 @@ parameters:
- rocminfo
- ROCR-Runtime
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
jobs:
- job: ROCdbgapi
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ROCdbgapi_build_${{ job.os }}
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}

View File

@@ -1,4 +1,7 @@
parameters:
- name: componentName
type: string
default: ROCgdb
- name: checkoutRepo
type: string
default: 'self'
@@ -23,8 +26,10 @@ parameters:
- libgmp-dev
- liblzma-dev
- libmpfr-dev
- pkg-config
- ncurses-dev
- pkg-config
- python3-dev
- python3-pip
- texinfo
- zlib1g-dev
- name: rocmDependencies
@@ -40,67 +45,87 @@ parameters:
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: ROCgdb
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: PKG_CONFIG_PATH
value: $(Agent.BuildDirectory)/rocm/share/pkgconfig
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-autotools.yml
parameters:
configureFlags: >-
--program-prefix=roc
--enable-64-bit-bfd
--enable-targets="x86_64-linux-gnu,amdgcn-amd-amdhsa"
--disable-ld
--disable-gas
--disable-gdbserver
--disable-sim
--enable-tui
--disable-gdbtk
--disable-shared
--disable-gprofng
--with-expat
--with-system-zlib
--without-guile
--with-babeltrace
--with-lzma
--with-python=python3
--with-rocm-dbgapi=$(Agent.BuildDirectory)/rocm
LDFLAGS="-Wl,--enable-new-dtags,-rpath=$(Agent.BuildDirectory)/rocm/lib"
makeCallPrefix: LD_RUN_PATH='${ORIGIN}/../lib'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: PKG_CONFIG_PATH
value: $(Agent.BuildDirectory)/rocm/share/pkgconfig
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
name: rocm-ci_medium_build_pool_2404
${{ else }}:
name: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-autotools.yml
parameters:
os: ${{ job.os }}
configureFlags: >-
--program-prefix=roc
--enable-64-bit-bfd
--enable-targets="x86_64-linux-gnu,amdgcn-amd-amdhsa"
--disable-ld
--disable-gas
--disable-gdbserver
--disable-sim
--enable-tui
--disable-gdbtk
--disable-shared
--disable-gprofng
--with-expat
--with-system-zlib
--without-guile
--with-babeltrace
--with-lzma
--with-python=python3
--with-rocm-dbgapi=$(Agent.BuildDirectory)/rocm
LDFLAGS="-Wl,--enable-new-dtags,-rpath=$(Agent.BuildDirectory)/rocm/lib"
makeCallPrefix: LD_RUN_PATH='${ORIGIN}/../lib'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ROCgdb_test_${{ job.target }}
dependsOn: ROCgdb
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -119,18 +144,23 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-autotools.yml
parameters:
os: ${{ job.os }}
configureFlags: >-
--program-prefix=roc
--enable-64-bit-bfd
@@ -166,7 +196,9 @@ jobs:
continueOnError: true
inputs:
targetType: inline
script: make check-gdb TESTS=gdb.rocm/simple.exp
script: |
${{ iif(eq(job.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
make check-gdb TESTS=gdb.rocm/simple.exp
workingDirectory: $(Build.SourcesDirectory)
- task: Bash@3
displayName: print gdb log

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: Tensile
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -13,7 +32,6 @@ parameters:
- name: aptPackages
type: object
default:
- python3-pip
- cmake
- libmsgpack-dev
- libboost-program-options-dev
@@ -38,75 +56,97 @@ parameters:
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: Tensile_build
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- task: Bash@3
displayName: Create wheel file
inputs:
targetType: inline
script: python3 setup.py bdist_wheel
workingDirectory: $(Build.SourcesDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
parameters:
sourceDir: $(Build.SourcesDirectory)/dist
contentsString: '*.whl'
targetDir: $(Build.ArtifactStagingDirectory)
clean: false
- task: PublishPipelineArtifact@1
displayName: 'wheel file Publish'
retryCountOnTaskFailure: 3
inputs:
targetPath: $(Build.ArtifactStagingDirectory)
- task: Bash@3
displayName: Save pipeline artifact file names
inputs:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: |
whlFile=$(find "$(Build.ArtifactStagingDirectory)" -type f -name "*.whl" | head -n 1)
if [ -n "$whlFile" ]; then
echo $(basename "$whlFile") >> pipelineArtifacts.txt
fi
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}
${{ if parameters.buildDependsOn }}:
dependsOn: ${{ parameters.buildDependsOn[job.target] }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- task: Bash@3
displayName: Create wheel file
inputs:
targetType: inline
script: python3 setup.py bdist_wheel
workingDirectory: $(Agent.BuildDirectory)/s
- task: Bash@3
displayName: Rename wheel file with job OS
inputs:
targetType: inline
workingDirectory: $(Agent.BuildDirectory)/s
script: |
wheelFile=$(find "$(Agent.BuildDirectory)/s/dist" -type f -name "*.whl" | head -n 1)
newWheelFile="$(basename "$wheelFile" .whl)-${{ job.os }}.whl"
mv "$wheelFile" "$(dirname "$wheelFile")/$newWheelFile"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
parameters:
sourceDir: $(Agent.BuildDirectory)/s/dist
contentsString: '*.whl'
targetDir: $(Build.ArtifactStagingDirectory)
clean: false
- task: PublishPipelineArtifact@1
displayName: 'wheel file Publish'
retryCountOnTaskFailure: 3
inputs:
targetPath: $(Build.ArtifactStagingDirectory)
- task: Bash@3
displayName: Save pipeline artifact file names
inputs:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: |
whlFile=$(find "$(Build.ArtifactStagingDirectory)" -type f -name "*.whl" | head -n 1)
if [ -n "$whlFile" ]; then
echo $(basename "$whlFile") >> pipelineArtifacts.txt
fi
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: Tensile_test_${{ job.target }}
- job: Tensile_test_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 180
dependsOn: Tensile_build
dependsOn: Tensile_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -126,20 +166,23 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: DownloadPipelineArtifact@2
displayName: 'Download Pipeline Wheel Files'
inputs:
itemPattern: '**/*.whl'
itemPattern: '**/*${{ job.os }}*.whl'
targetPath: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- task: Bash@3
displayName: pip install
@@ -164,7 +207,7 @@ jobs:
inputs:
targetType: inline
script: tox run -v -e ci -- -m pre_checkin
workingDirectory: $(Build.SourcesDirectory)
workingDirectory: $(Agent.BuildDirectory)/s
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
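
The wheel rename above is what makes the per-OS test jobs work: every build job publishes into the same
pipeline, so each wheel is suffixed with its matrix OS and the test job's download filter only matches its
own suffix. A minimal sketch of the pairing (wheel name illustrative):

  # build job: tag the wheel with the matrix OS
  - script: |
      wheelFile=$(find dist -type f -name "*.whl" | head -n 1)
      mv "$wheelFile" "${wheelFile%.whl}-${{ job.os }}.whl"   # e.g. Tensile-x.y.z-py3-none-any.whl -> ...-ubuntu2204.whl

  # test job: pull back only the wheel built for this OS
  - task: DownloadPipelineArtifact@2
    inputs:
      itemPattern: '**/*${{ job.os }}*.whl'
      targetPath: $(Agent.BuildDirectory)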

View File

@@ -16,50 +16,66 @@ parameters:
- cmake
- libdrm-dev
- ninja-build
- python3-pip
- pkg-config
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: ubuntu2404, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: amdsmi_build
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DBUILD_TESTS=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: amdsmi_build_${{ job.os }}
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
vmImage: 'ubuntu-24.04'
${{ else }}:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DBUILD_TESTS=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: amdsmi_test_${{ job.target }}
dependsOn: amdsmi_build
- job: amdsmi_test_${{ job.os }}_${{ job.target }}
dependsOn: amdsmi_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -76,8 +92,11 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
parameters:
runRocminfo: false
@@ -85,8 +104,9 @@ jobs:
parameters:
componentName: amdsmi
testDir: '$(Agent.BuildDirectory)'
testExecutable: './rocm/share/amd_smi/tests/amdsmitst'
testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
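
The amdsmi template shows the refactor repeated across this compare: a single hard-coded job becomes an
${{ each }} expansion over a jobMatrix parameter, job names gain an OS suffix, and test jobs depend on the
matching suffixed build. Expansion happens at compile time, so each OS leg is a fully materialized job. A
stripped-down sketch of the shape (component name hypothetical):

  parameters:
  - name: jobMatrix
    type: object
    default:
      buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }

  jobs:
  - ${{ each job in parameters.jobMatrix.buildJobs }}:
    - job: demo_build_${{ job.os }}        # expands to demo_build_ubuntu2204 and demo_build_almalinux8
      ${{ if eq(job.os, 'almalinux8') }}:
        container:                         # almalinux8 legs build inside the manylinux container
          image: rocmexternalcicd.azurecr.io/manylinux228:latest
          endpoint: ContainerService3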

View File

@@ -1,4 +1,7 @@
parameters:
- name: componentName
type: string
default: aomp
- name: checkoutRepo
type: string
default: 'self'
@@ -15,173 +18,187 @@ parameters:
- name: aptPackages
type: object
default:
- bison
- ccache
- cmake
- python3-pip
- ninja-build
- pkg-config
- libpci-dev
- libnuma-dev
- libffi-dev
- git
- libopenmpi-dev
- flex
- gawk
- git
- mesa-common-dev
- libtool
- ninja-build
- libbabeltrace-dev
- libbison-dev
- libdrm-amdgpu1
- libdrm-dev
- libdw-dev
- libgtest-dev
- libsystemd-dev
- libffi-dev
- libgmp-dev
- liblzma-dev
- libmpfr-dev
- libncurses5-dev
- libnuma-dev
- libopenmpi-dev
- libpci-dev
- libssl-dev
- libstdc++-12-dev
- ccache
- libgmp-dev
- libmpfr-dev
- texinfo
- libbison-dev
- bison
- flex
- libbabeltrace-dev
- libncurses5-dev
- liblzma-dev
- python3-setuptools
- python3-dev
- libsystemd-dev
- libtool
- libudev-dev
- parallel
# Reference: comment snippet from
# https://github.com/ROCm/aomp/blob/aomp-dev/bin/build_aomp.sh#L131-L134
#
# For ROCM build (AOMP_STANDALONE_BUILD=0) the components roct, rocr,
# libdevice, project, comgr, rocminfo, hipamd, rocdbgapi, rocgdb,
# roctracer, rocprofiler, rocm_smi_lib, and amdsmi should be found
# in ROCM in /opt/rocm. The ROCM build only needs these components:
- pkg-config
- python3-dev
- python3-pip
- python3-setuptools
- texinfo
- name: rocmDependencies
type: object
default:
- amdsmi
- llvm-project
- ROCR-Runtime
- name: rocmTestDependencies
type: object
default:
- clr
- llvm-project
- ROCdbgapi
- ROCgdb
- rocm-cmake
- rocm-core
- rocminfo
- rocm_smi_lib
- rocprofiler
- rocprofiler-register
- rocprofiler-sdk
- ROCR-Runtime
- roctracer
- rocprofiler-register
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: aomp
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
# checkout the repos tied to openmp-extras, plus llvm-project
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: aomp-extras_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: flang_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: llvm-project_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: extras
cmakeBuildDir: '$(Build.SourcesDirectory)/aomp-extras/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/aomp-extras'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm
-DCMAKE_BUILD_TYPE=Release
-DAOMP_STANDALONE_BUILD=0
-DAOMP_VERSION_STRING=9.99.99
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: openmp
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/openmp/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/openmp'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
-DCMAKE_BUILD_TYPE=Release
-DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DOPENMP_ENABLE_LIBOMPTARGET=1
-DLIBOMP_COPY_EXPORTS=OFF
-DLIBOMP_OMPT_SUPPORT=ON
-DLIBOMP_OMPD_SUPPORT=ON
-DCMAKE_SKIP_INSTALL_RPATH=TRUE
-DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
-DLIBOMP_FORTRAN_MODULES_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/flang
-DLIBOMP_MODULES_INSTALL_PATH=$(Build.BinariesDirectory)/llvm/include/flang/
-GNinja
- task: Bash@3
displayName: 'ROCm symbolic link'
inputs:
targetType: inline
script: sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: offload
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/offload/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/offload'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
-DCMAKE_BUILD_TYPE=Release
-DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_SKIP_INSTALL_RPATH=TRUE
-DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
-DLIBOMPTARGET_LLVM_INCLUDE_DIRS=$(Build.SourcesDirectory)/llvm-project/llvm/include
-DCMAKE_EXE_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
-DCMAKE_SHARED_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
name: rocm-ci_medium_build_pool_2404
${{ else }}:
name: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
# checkout the repos tied to openmp-extras, plus llvm-project
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: aomp-extras_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: flang_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: llvm-project_repo
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
componentName: extras
cmakeBuildDir: '$(Build.SourcesDirectory)/aomp-extras/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/aomp-extras'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm
-DCMAKE_BUILD_TYPE=Release
-DAOMP_STANDALONE_BUILD=0
-DAOMP_VERSION_STRING=9.99.99
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
componentName: openmp
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/openmp/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/openmp'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
-DCMAKE_BUILD_TYPE=Release
-DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DOPENMP_ENABLE_LIBOMPTARGET=1
-DLIBOMP_COPY_EXPORTS=OFF
-DLIBOMP_OMPD_SUPPORT=ON
-DCMAKE_SKIP_INSTALL_RPATH=TRUE
-DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
-DLIBOMP_FORTRAN_MODULES_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/flang
-DLIBOMP_MODULES_INSTALL_PATH=$(Build.BinariesDirectory)/llvm/include/flang/
multithreadFlag: -- -j32
- task: Bash@3
displayName: 'ROCm symbolic link'
inputs:
targetType: inline
script: sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
componentName: offload
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/offload/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/offload'
installDir: '$(Build.BinariesDirectory)/llvm'
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
-DCMAKE_BUILD_TYPE=Release
-DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-DCMAKE_SKIP_INSTALL_RPATH=TRUE
-DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
-DLIBOMPTARGET_LLVM_INCLUDE_DIRS=$(Build.SourcesDirectory)/llvm-project/llvm/include
-DCMAKE_EXE_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
-DCMAKE_SHARED_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: aomp_test_${{ job.target }}
dependsOn: aomp
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -198,12 +215,16 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
- task: Bash@3
displayName: ROCm symbolic link
inputs:
@@ -215,7 +236,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: aomp-extras_repo
# these copy steps are from the aomp prototype script for test prep
- task: CopyFiles@2
displayName: 'Copy AOMP contents'
inputs:
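
Worth noting in the aomp template: the build jobs keep pulling the short rocmDependencies list (amdsmi,
llvm-project, ROCR-Runtime), while the test jobs now pass the much longer rocmTestDependencies list to
dependencies-rocm.yml, so runtime-only components such as rocprofiler and roctracer are staged on the test
machine without bloating the build leg. Sketch of the two call sites side by side:

  # build job: build-time dependencies only
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
    parameters:
      dependencyList: ${{ parameters.rocmDependencies }}
      os: ${{ job.os }}

  # test job: the runtime list instead
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
    parameters:
      dependencyList: ${{ parameters.rocmTestDependencies }}
      os: ${{ job.os }}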

View File

@@ -1,36 +1,42 @@
parameters:
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
- name: jobMatrix
type: object
default:
copyJobs:
- { os: ubuntu2204, backend: amd }
- { os: almalinux8, backend: amd }
- { os: ubuntu2204, backend: nvidia }
- { os: almalinux8, backend: nvidia }
# hip and clr are tightly-coupled
# run this same template for both repos
# any changes for clr should just trigger HIP pipeline
jobs:
- job: hip_clr_combined
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
# checkout nothing, just copy artifacts from triggering HIP job
# and then publish for this clr job or for this hipother job to maintain latest
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-download.yml
parameters:
componentName: HIP
pipelineId: $(HIP_PIPELINE_ID)
- task: Bash@3
displayName: Copy HIP artifacts
inputs:
targetType: inline
script: cp -a $(Agent.BuildDirectory)/rocm/* $(Build.BinariesDirectory)/
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ each job in parameters.jobMatrix.copyJobs }}:
- job: hip_clr_combined_${{ job.os }}_${{ job.backend }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
# checkout nothing, just copy artifacts from triggering HIP job
# and then publish for this clr job or for this hipother job to maintain latest
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-download.yml
parameters:
componentName: HIP
pipelineId: $(HIP_PIPELINE_ID)
fileFilter: ${{ job.os }}*${{ job.backend }}
- task: Bash@3
displayName: Copy HIP artifacts
inputs:
targetType: inline
script: cp -a $(Agent.BuildDirectory)/rocm/* $(Build.BinariesDirectory)/
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
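
Each (os, backend) copy job re-publishes only its slice of the upstream HIP artifacts. The glob
fileFilter: ${{ job.os }}*${{ job.backend }} assumes the artifact file names embed both the OS and the
backend in that order; that naming convention lives in the upstream HIP pipeline, not in this template.
One expanded leg, written out literally:

  - job: hip_clr_combined_ubuntu2204_amd
    steps:
    - checkout: none
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-download.yml
      parameters:
        componentName: HIP
        pipelineId: $(HIP_PIPELINE_ID)
        fileFilter: ubuntu2204*amd   # only the ubuntu2204 AMD-backend files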

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: hipBLAS-common
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -14,54 +33,103 @@ parameters:
type: object
default:
- cmake
- ninja-build
- git
- ninja-build
- wget
- python3-pip
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- clr
- llvm-project
- rocm-cmake
- rocminfo
- ROCR-Runtime
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
# - name: downstreamComponentMatrix
# type: object
# default:
# - hipBLASLt:
# name: hipBLASLt
# sparseCheckoutDir: projects/hipblaslt
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - hipBLAS_common_build
jobs:
- job: hipBLAS_common
variables:
- group: common
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# extraEnvVars:
# - ROCM_PATH:::/home/user/workspace/rocm
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hipBLAS_common_build_${{ job.os }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
componentName: ${{ parameters.componentName }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# extraEnvVars:
# - ROCM_PATH:::/home/user/workspace/rocm
# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}
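
The buildDependsOn parameter is how the (currently commented-out) monorepo chaining would line up with the
matrix-expanded job names: a caller passes upstream job-name prefixes, and the template appends the matrix
OS to each. A sketch based on the commented downstreamComponentMatrix above, where hipBLASLt would be
invoked with this template's build as its upstream:

  - template: /.azuredevops/components/hipBLASLt.yml@pipelines_repo
    parameters:
      buildDependsOn:
      - hipBLAS_common_build   # expanded per job to hipBLAS_common_build_${{ job.os }}
      triggerDownstreamJobs: true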

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: hipBLASLt
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -13,6 +32,8 @@ parameters:
- name: aptPackages
type: object
default:
- ccache
- gfortran
- git
- libdrm-dev
- libmsgpack-dev
@@ -20,9 +41,6 @@ parameters:
- ninja-build
- python3-pip
- python3-venv
- gfortran
- libblas-dev
- ccache
- name: pipModules
type: object
default:
@@ -37,6 +55,7 @@ parameters:
- hipBLAS-common
- llvm-project
- rocminfo
- rocm-cmake
- rocm_smi_lib
- rocprofiler-register
- ROCR-Runtime
@@ -58,20 +77,37 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
# - { os: almalinux8, packageManager: dnf, target: gfx942 }
# - { os: almalinux8, packageManager: dnf, target: gfx90a }
# - { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
# - { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
# - name: downstreamComponentMatrix
# type: object
# default:
# - rocBLAS:
# name: rocBLAS
# sparseCheckoutDir: projects/rocblas
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - hipBLASLt_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hipBLASLt_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 300
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -86,6 +122,10 @@ jobs:
- name: DAY_STRING
value: $[format('{0:ddMMyyyy}', pipeline.startTime)]
pool: ${{ variables.ULTRA_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
@@ -93,17 +133,22 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- task: Bash@3
displayName: Add ROCm binaries to PATH
inputs:
@@ -111,22 +156,20 @@ jobs:
script: |
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
# Build and install gtest, lapack, hipBLAS-common
# $(Pipeline.Workspace)/deps is a temporary folder for the build process
# $(Pipeline.Workspace)/s/deps is part of the hipBLASLt repo
- script: mkdir $(Pipeline.Workspace)/deps
displayName: Create temp folder for external dependencies
# hipBLASLt already has a CMake script for external deps, so we can just run that
# https://github.com/ROCm/hipBLASLt/blob/develop/deps/CMakeLists.txt
- script: cmake $(Pipeline.Workspace)/s/deps
displayName: Configure hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- script: make
displayName: Build hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
- script: sudo make install
displayName: Install hipBLASLt external dependencies
workingDirectory: $(Pipeline.Workspace)/deps
# hipBLASLt has a script for gtest and lapack
# https://github.com/ROCm/hipBLASLt/blob/develop/deps/CMakeLists.txt
# $(Agent.BuildDirectory)/deps is a temporary folder for the build process
# $(Agent.BuildDirectory)/s/deps is part of the hipBLASLt repo
- task: Bash@3
displayName: Build and install external dependencies
inputs:
targetType: inline
script: |
mkdir -p $(Agent.BuildDirectory)/deps
cd $(Agent.BuildDirectory)/deps
cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
make
sudo make install
- script: |
mkdir -p $(CCACHE_DIR)
echo "##vso[task.prependpath]/usr/lib/ccache"
@@ -134,93 +177,117 @@ jobs:
- task: Cache@2
displayName: Ccache caching
inputs:
key: hipBLASLt | $(Agent.OS) | ${{ job.target }} | $(DAY_STRING) | $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
key: hipBLASLt | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING) | $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
path: $(CCACHE_DIR)
restoreKeys: |
hipBLASLt | $(Agent.OS) | ${{ job.target }} | $(DAY_STRING)
hipBLASLt | $(Agent.OS) | ${{ job.target }}
hipBLASLt | $(Agent.OS)
hipBLASLt | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING)
hipBLASLt | ${{ job.os }} | ${{ job.target }}
hipBLASLt | ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-DCMAKE_C_COMPILER_LAUNCHER=ccache
-DAMDGPU_TARGETS=${{ job.target }}
-DTensile_LOGIC=
-DTensile_CPU_THREADS=
-DTensile_LIBRARY_FORMAT=msgpack
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-DBUILD_CLIENTS_TESTS=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
installLatestCMake: true
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/amdclang
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
- ROCM_PATH:::/home/user/workspace/rocm
extraCopyDirectories:
- deps
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
installLatestCMake: true
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/amdclang
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
- ROCM_PATH:::/home/user/workspace/rocm
extraCopyDirectories:
- deps
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: hipBLASLt_test_${{ job.target }}
timeoutInMinutes: 300
dependsOn: hipBLASLt_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: hipBLASLt
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './hipblaslt-test'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 300
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './hipblaslt-test'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}
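
Two details in the hipBLASLt build deserve a note. First, the Cache@2 key moves from $(Agent.OS) to
${{ job.os }}: $(Agent.OS) reports Linux for both the ubuntu2204 and almalinux8 legs, so the old key would
have shared one ccache across OSs, while restoreKeys still falls back from most to least specific key.
Second, the consolidated dependency step adds -DCMAKE_POSITION_INDEPENDENT_CODE=ON, presumably because the
static gtest/lapack objects built here are later linked into hipBLASLt shared libraries, which requires
-fPIC code. The step with that reading spelled out as comments:

  - task: Bash@3
    displayName: Build and install external dependencies
    inputs:
      targetType: inline
      script: |
        mkdir -p $(Agent.BuildDirectory)/deps
        cd $(Agent.BuildDirectory)/deps
        # PIC is assumed necessary here: deps are built static but end up inside shared libraries
        cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
        make
        sudo make install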

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: hipCUB
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -14,9 +33,8 @@ parameters:
type: object
default:
- cmake
- ninja-build
- libgtest-dev
- git
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
@@ -33,103 +51,143 @@ parameters:
- llvm-project
- rocminfo
- rocPRIM
- ROCR-Runtime
- rocprofiler-register
- ROCR-Runtime
- name: jobMatrix
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hipCUB_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DBUILD_BENCHMARK=ON
-DBUILD_TEST=ON
-DAMDGPU_TARGETS=${{ job.target }}
-GNinja
extraCxxFlags: -Wno-deprecated-declarations
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: hipCUB_test_${{ job.target }}
dependsOn: hipCUB_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: hipCUB
testDir: '$(Agent.BuildDirectory)/rocm/bin/hipcub'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/hipcub'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
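
Two hipCUB changes worth flagging: the test job now opens with checkout: none, since everything it executes
comes from the downloaded build artifacts rather than source, and gtest moves from the apt list
(libgtest-dev) to the vendor step so the apt and dnf legs consume the same build of it, exposed to CMake
through the widened prefix path. The vendor wiring, as in the diff:

  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
    parameters:
      dependencyList:
      - gtest   # staged under $(Agent.BuildDirectory)/vendor
  # and later, among the build-cmake.yml flags:
  #   -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"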

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: hipFFT
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -61,7 +80,11 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hipFFT_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.target }} # todo: add OS
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -79,12 +102,15 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
@@ -102,9 +128,11 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -113,8 +141,8 @@ jobs:
# gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: hipFFT_test_${{ job.target }}
dependsOn: hipFFT_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -134,6 +162,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
@@ -141,10 +170,12 @@ jobs:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: hipFFT
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './hipfft-test'
testParameters: '--test_prob 0.002 --gtest_output=xml:./test_output.xml --gtest_color=yes'
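
hipFFT is only partially migrated: its jobs are still keyed by GPU target alone (hence the "todo: add OS" on
the dependsOn expansion), while it already gains the componentName parameter and the preTargetFilter argument
to local-artifact-download. The filter presumably narrows downloads to this component's own uploads before
the target filter applies; the actual semantics live in local-artifact-download.yml, which is not part of
this compare. The call as it now stands:

  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
    parameters:
      preTargetFilter: ${{ parameters.componentName }}   # hipFFT
      gpuTarget: ${{ job.target }}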

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: hipRAND
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -14,18 +33,18 @@ parameters:
type: object
default:
- cmake
- ninja-build
- googletest
- git
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
default:
- llvm-project
- ROCR-Runtime
- clr
- llvm-project
- rocm-cmake
- rocminfo
- rocRAND
- ROCR-Runtime
- name: rocmTestDependencies
type: object
default:
@@ -33,110 +52,168 @@ parameters:
- llvm-project
- rocminfo
- rocprofiler-register
- ROCR-Runtime
- rocRAND
- ROCR-Runtime
- name: jobMatrix
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
# - name: downstreamComponentMatrix
# type: object
# default:
# - rocFFT:
# name: rocFFT
# sparseCheckoutDir: projects/rocfft
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - hipRAND_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hipRAND_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DBUILD_TEST=ON
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=${{ job.target }}
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# gpuTarget: ${{ job.target }}
# extraEnvVars:
# - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: hipRAND_test_${{ job.target }}
dependsOn: hipRAND_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: hipRAND
testDir: '$(Agent.BuildDirectory)/rocm/bin/hipRAND'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/hipRAND'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}
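Because ${{ if }} and ${{ each }} are resolved at template-compile time, the buildDependsOn block above is stamped into concrete job names before the pipeline runs. A sketch of the expansion for one matrix entry, assuming this template is invoked from rocRAND's downstreamComponentMatrix with buildDependsOn: [rocRAND_build] and the entry { os: ubuntu2204, target: gfx942 }:

jobs:
  - job: hipRAND_build_ubuntu2204_gfx942
    dependsOn:
      - rocRAND_build_ubuntu2204_gfx942   # ${{ build }}_${{ job.os }}_${{ job.target }}
    # ...steps as above...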

View File

@@ -14,146 +14,188 @@ parameters:
type: object
default:
- cmake
- python3-pip
- libnuma-dev
- ninja-build
- python-is-python3
- zlib1g-dev
- pkg-config
- python-is-python3
- python3-pip
- zlib1g-dev
- name: rocmDependencies
type: object
default:
- rocm-cmake
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: ubuntu2404, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
jobs:
- job: llvm_project
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: HIP_DEVICE_LIB_PATH
value: '$(Build.BinariesDirectory)/amdgcn/bitcode'
- name: HIP_PATH
value: '$(Agent.BuildDirectory)/rocm'
pool: ${{ variables.ULTRA_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
skipLlvmSymlink: true
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: rocm-llvm
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Build.BinariesDirectory)/llvm;$(Build.BinariesDirectory)"
-DCMAKE_BUILD_TYPE=Release
-DLLVM_ENABLE_PROJECTS=clang;lld;clang-tools-extra;mlir;flang
-DLLVM_ENABLE_RUNTIMES=compiler-rt;libunwind;libcxx;libcxxabi
-DCLANG_ENABLE_AMDCLANG=ON
-DLLVM_TARGETS_TO_BUILD=AMDGPU;X86
-DLIBCXX_ENABLE_SHARED=OFF
-DLIBCXX_ENABLE_STATIC=ON
-DLIBCXX_INSTALL_LIBRARY=OFF
-DLIBCXX_INSTALL_HEADERS=OFF
-DLIBCXXABI_ENABLE_SHARED=OFF
-DLIBCXXABI_ENABLE_STATIC=ON
-DLIBCXXABI_INSTALL_STATIC_LIBRARY=OFF
-DLLVM_BUILD_DOCS=OFF
-DLLVM_ENABLE_SPHINX=OFF
-DLLVM_ENABLE_ASSERTIONS=OFF
-DLLVM_ENABLE_Z3_SOLVER=OFF
-DLLVM_ENABLE_ZLIB=ON
-DCLANG_DEFAULT_LINKER=lld
-DCLANG_DEFAULT_RTLIB=compiler-rt
-DCLANG_DEFAULT_UNWINDLIB=libgcc
-DSANITIZER_AMDGPU=OFF
-DPACKAGE_VENDOR=AMD
-DCLANG_LINK_FLANG_LEGACY=ON
-DCMAKE_CXX_STANDARD=17
-DROCM_LLVM_BACKWARD_COMPAT_LINK=$(Build.BinariesDirectory)/llvm
-DROCM_LLVM_BACKWARD_COMPAT_LINK_TARGET=./lib/llvm
-GNinja
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm'
installDir: '$(Build.BinariesDirectory)/llvm'
# use llvm-lit to run unit tests for llvm, clang, and lld
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: check-llvm
testDir: 'llvm/build'
testExecutable: './bin/llvm-lit'
testParameters: '-q --xunit-xml-output=llvm_test_output.xml --filter-out="live-debug-values-spill-tracking" ./test'
testOutputFile: llvm_test_output.xml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: check-clang
testDir: 'llvm/build'
testExecutable: './bin/llvm-lit'
testParameters: '-q --xunit-xml-output=clang_test_output.xml ./tools/clang/test'
testOutputFile: clang_test_output.xml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: check-lld
testDir: 'llvm/build'
testExecutable: './bin/llvm-lit'
testParameters: '-q --xunit-xml-output=lld_test_output.xml ./tools/lld/test'
testOutputFile: lld_test_output.xml
- task: CopyFiles@2
displayName: Copy FileCheck for Publishing
inputs:
CleanTargetFolder: false
SourceFolder: llvm/build/bin
Contents: FileCheck
TargetFolder: $(Build.BinariesDirectory)/llvm/bin
retryCount: 3
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: device-libs
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build"
-DCMAKE_BUILD_TYPE=Release
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/device-libs/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/device-libs'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: comgr
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build;$(Build.SourcesDirectory)/amd/device-libs/build"
-DCOMGR_DISABLE_SPIRV=1
-DCMAKE_BUILD_TYPE=Release
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/comgr/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/comgr'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: comgr
testParameters: '--output-on-failure --force-new-ctest-process --output-junit comgr_test_output.xml'
testDir: 'amd/comgr/build'
testOutputFile: comgr_test_output.xml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: hipcc
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DHIPCC_BACKWARD_COMPATIBILITY=OFF
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/hipcc/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/hipcc'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: combined
extraEnvVars:
- HIP_DEVICE_LIB_PATH:::/home/user/workspace/bin/amdgcn/bitcode
- HIP_PATH:::/home/user/workspace/rocm
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: llvm_project_${{ job.os }}
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
name: 'rocm-ci_high_build_pool_2404' # temporarily using 'high' pool while 'ultra' is down
${{ else }}:
name: 'rocm-ci_ultra_build_pool'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: HIP_DEVICE_LIB_PATH
value: '$(Build.BinariesDirectory)/amdgcn/bitcode'
- name: HIP_PATH
value: '$(Agent.BuildDirectory)/rocm'
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
skipLlvmSymlink: true
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: rocm-llvm
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Build.BinariesDirectory)/llvm;$(Build.BinariesDirectory)"
-DCMAKE_BUILD_TYPE=Release
-DLLVM_ENABLE_PROJECTS=clang;lld;clang-tools-extra;mlir;flang
-DLLVM_ENABLE_RUNTIMES=compiler-rt;libunwind;libcxx;libcxxabi
-DCLANG_ENABLE_AMDCLANG=ON
-DLLVM_TARGETS_TO_BUILD=AMDGPU;X86
-DLIBCXX_ENABLE_SHARED=OFF
-DLIBCXX_ENABLE_STATIC=ON
-DLIBCXX_INSTALL_LIBRARY=OFF
-DLIBCXX_INSTALL_HEADERS=OFF
-DLIBCXXABI_ENABLE_SHARED=OFF
-DLIBCXXABI_ENABLE_STATIC=ON
-DLIBCXXABI_INSTALL_STATIC_LIBRARY=OFF
-DLLVM_BUILD_DOCS=OFF
-DLLVM_ENABLE_SPHINX=OFF
-DLLVM_ENABLE_ASSERTIONS=OFF
-DLLVM_ENABLE_Z3_SOLVER=OFF
-DLLVM_ENABLE_ZLIB=ON
-DCLANG_DEFAULT_LINKER=lld
-DCLANG_DEFAULT_RTLIB=compiler-rt
-DCLANG_DEFAULT_UNWINDLIB=libgcc
-DSANITIZER_AMDGPU=OFF
-DPACKAGE_VENDOR=AMD
-DCLANG_LINK_FLANG_LEGACY=ON
-DCMAKE_CXX_STANDARD=17
-DROCM_LLVM_BACKWARD_COMPAT_LINK=$(Build.BinariesDirectory)/llvm
-DROCM_LLVM_BACKWARD_COMPAT_LINK_TARGET=./lib/llvm
-GNinja
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm'
installDir: '$(Build.BinariesDirectory)/llvm'
# use llvm-lit to run unit tests for llvm, clang, and lld
- task: Bash@3
displayName: 'Copy llvm-lit to install directory'
inputs:
targetType: inline
script: |
cp $(Build.SourcesDirectory)/llvm/build/bin/llvm-lit $(Build.BinariesDirectory)/llvm/bin/
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: check-llvm
testDir: 'llvm/build'
testExecutable: './bin/llvm-lit'
testParameters: '-q --xunit-xml-output=llvm_test_output.xml --filter-out="live-debug-values-spill-tracking" ./test'
testOutputFile: llvm_test_output.xml
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: check-clang
testDir: 'llvm/build'
testExecutable: './bin/llvm-lit'
testParameters: '-q --xunit-xml-output=clang_test_output.xml ./tools/clang/test'
testOutputFile: clang_test_output.xml
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: check-lld
testDir: 'llvm/build'
testExecutable: './bin/llvm-lit'
testParameters: '-q --xunit-xml-output=lld_test_output.xml ./tools/lld/test'
testOutputFile: lld_test_output.xml
os: ${{ job.os }}
- task: CopyFiles@2
displayName: Copy FileCheck for Publishing
inputs:
CleanTargetFolder: false
SourceFolder: llvm/build/bin
Contents: FileCheck
TargetFolder: $(Build.BinariesDirectory)/llvm/bin
retryCount: 3
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: device-libs
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build"
-DCMAKE_BUILD_TYPE=Release
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/device-libs/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/device-libs'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: comgr
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build;$(Build.SourcesDirectory)/amd/device-libs/build"
-DCOMGR_DISABLE_SPIRV=1
-DCMAKE_BUILD_TYPE=Release
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/comgr/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/comgr'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: comgr
testParameters: '--output-on-failure --force-new-ctest-process --output-junit comgr_test_output.xml'
testDir: 'amd/comgr/build'
testOutputFile: comgr_test_output.xml
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: hipcc
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DHIPCC_BACKWARD_COMPATIBILITY=OFF
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/hipcc/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/hipcc'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: combined
extraEnvVars:
- HIP_DEVICE_LIB_PATH:::/home/user/workspace/bin/amdgcn/bitcode
- HIP_PATH:::/home/user/workspace/rocm
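The ${{ if eq(job.os, 'almalinux8') }}: guard above relies on template expressions being able to insert whole mapping keys: when the condition is false, the container: key simply never exists and the job runs directly on the agent VM. A self-contained sketch of the pattern (the pool name, image, and parameter default are placeholders, not values from this pipeline):

parameters:
  - name: os
    type: string
    default: ubuntu2204

jobs:
  - job: conditional_container_demo
    pool: example_build_pool                            # placeholder pool
    ${{ if eq(parameters.os, 'almalinux8') }}:
      container: example.azurecr.io/almalinux8:latest   # placeholder image
    steps:
      - script: cat /etc/os-release   # runs inside the container only for almalinux8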

View File

@@ -15,7 +15,6 @@ parameters:
default:
- cmake
- git
- googletest
- libboost-program-options-dev
- libdrm-dev
- libfftw3-dev
@@ -90,6 +89,10 @@ jobs:
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
submoduleBehaviour: recursive
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
@@ -101,12 +104,11 @@ jobs:
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
-DCMAKE_BUILD_TYPE=Release
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TESTS=ON
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake;$(Agent.BuildDirectory)/rocm/libexec/hipify
-DAMDGPU_TARGETS=${{ job.target }}
-DGPU_TARGETS=${{ job.target }}
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
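One detail worth noting in the extraBuildFlags block above: -DCMAKE_PREFIX_PATH carries several semicolon-separated entries without quotes. In a plain POSIX shell an unquoted ; would end the command, so this presumably relies on how steps/build-cmake.yml forwards extraBuildFlags; other files in this diff quote the same flag defensively. A conservative form, if the forwarding behavior is in doubt:

-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake;$(Agent.BuildDirectory)/rocm/libexec/hipify"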

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: rocBLAS
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -64,19 +83,43 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
# - { os: almalinux8, packageManager: dnf, target: gfx942 }
# - { os: almalinux8, packageManager: dnf, target: gfx90a }
# - { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
# - { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
# - name: downstreamComponentMatrix
# type: object
# default:
# # rocSOLVER depends on both rocBLAS and rocPRIM
# # for a unified build, rocBLAS will be the one to call rocSOLVER
# - rocSOLVER:
# name: rocSOLVER
# sparseCheckoutDir: projects/rocsolver
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - rocBLAS_build
# unifiedBuild:
# downstreamAggregateNames: rocBLAS+rocPRIM
# buildDependsOn:
# - rocBLAS_build
# - rocPRIM_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocBLAS_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -89,6 +132,10 @@ jobs:
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
@@ -96,19 +143,26 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aocl.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_TOOLCHAIN_FILE=toolchain-linux.cmake
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
@@ -128,63 +182,94 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
installAOCL: true
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
- ROCM_PATH:::/home/user/workspace/rocm
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
installAOCL: true
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
- ROCM_PATH:::/home/user/workspace/rocm
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocBLAS_test_${{ job.target }}
dependsOn: rocBLAS_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocBLAS
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocblas-test'
testParameters: '--yaml rocblas_smoke.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocblas-test'
testParameters: '--yaml rocblas_smoke.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}
# ${{ if parameters.unifiedBuild }}:
# buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
# ${{ else }}:
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
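The disable-list check now matches on the component name rather than Build.DefinitionName, which matters once several components share one pipeline definition. A sketch of how the condition evaluates, with hypothetical variable values (not from this repo):

variables:
  ENABLE_GFX942_TESTS: 'true'
  DISABLED_GFX942_TESTS: 'rocBLAS,rocSPARSE'   # hypothetical example value

jobs:
  - job: rocBLAS_test_ubuntu2204_gfx942
    condition: >-
      and(succeeded(),
      eq(variables['ENABLE_GFX942_TESTS'], 'true'),
      not(containsValue(split(variables['DISABLED_GFX942_TESTS'], ','), 'rocBLAS')))
    steps:
      - script: echo would run ./rocblas-test

With the values above, split(...) yields ['rocBLAS', 'rocSPARSE'], containsValue(...) is true, so the not(...) clause is false and the job is skipped.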

View File

@@ -1,4 +1,7 @@
parameters:
- name: componentName
type: string
default: rocDecode
- name: checkoutRepo
type: string
default: 'self'
@@ -13,29 +16,28 @@ parameters:
- name: aptPackages
type: object
default:
- python3-pip
- cmake
- ninja-build
- pkg-config
- ffmpeg
- libavcodec-dev
- libavformat-dev
- libavutil-dev
- libdrm-dev
- libstdc++-12-dev
- libva-amdgpu-dev
- mesa-amdgpu-va-drivers
- libdrm-dev
- ninja-build
- pkg-config
- python3-pip
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- clr
- rocminfo
- llvm-project
- rocm-cmake
- rocm-core
- rocminfo
- rocprofiler-register
- ROCR-Runtime
- name: rocmTestDependencies
type: object
default:
@@ -48,53 +50,70 @@ parameters:
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: rocDecode_build
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# registerROCmPackages: true
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# registerROCmPackages: true
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocDecode_test_${{ job.target }}
dependsOn: rocDecode_build
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -114,20 +133,27 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- task: Bash@3
displayName: Build rocDecode tests
inputs:
targetType: inline
script: |
${{ iif(eq(job.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
mkdir rocDecode-tests
cd rocDecode-tests
cmake $(Agent.BuildDirectory)/rocm/share/rocdecode/test
@@ -136,6 +162,7 @@ jobs:
parameters:
componentName: rocDecode
testDir: 'rocDecode-tests'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
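The ${{ iif(...) }} in the test-build script is evaluated when the template is expanded, so the almalinux8 job's script literally begins with the source line while the ubuntu job gets an empty line in its place. A standalone sketch of the pattern (parameter plumbing simplified; the build invocation after cmake is an assumption, since the hunk above is cut off before it):

parameters:
  - name: os
    type: string
    default: ubuntu2204

steps:
  - task: Bash@3
    displayName: Build rocDecode tests
    inputs:
      targetType: inline
      script: |
        ${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
        mkdir -p rocDecode-tests && cd rocDecode-tests
        cmake $(Agent.BuildDirectory)/rocm/share/rocdecode/test
        make -j$(nproc)   # assumed build step, not shown in the hunk above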

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: rocFFT
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -59,10 +78,23 @@ parameters:
target: gfx942
- gfx90a:
target: gfx90a
# - name: downstreamComponentMatrix
# type: object
# default:
# - hipFFT:
# name: hipFFT
# sparseCheckoutDir: projects/hipfft
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - rocFFT_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocFFT_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_ubuntu2204_${{ job.target }} # todo: un-hardcode OS
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -79,12 +111,15 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
@@ -101,9 +136,11 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -114,8 +151,8 @@ jobs:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocFFT_test_${{ job.target }}
dependsOn: rocFFT_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -135,6 +172,7 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
@@ -142,10 +180,12 @@ jobs:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocFFT
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocfft-test'
testParameters: '--test_prob 0.004 --gtest_output=xml:./test_output.xml --gtest_color=yes'
@@ -154,3 +194,15 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}
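The "# todo: un-hardcode OS" comment above exists because this file's own jobs are still named without an OS segment (rocFFT_build_gfx942), while the upstream jobs it can depend on already carry one. Assuming rocFFT is invoked downstream of hipRAND with buildDependsOn: [hipRAND_build], as hipRAND's commented-out matrix suggests, the expansion would be roughly:

jobs:
  - job: rocFFT_build_gfx942
    dependsOn:
      - hipRAND_build_ubuntu2204_gfx942   # OS hardcoded until rocFFT jobs gain an os segment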

View File

@@ -1,4 +1,7 @@
parameters:
- name: componentName
type: string
default: rocJPEG
- name: checkoutRepo
type: string
default: 'self'
@@ -44,32 +47,44 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocJPEG_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'ubuntu2404') }}:
name: rocm-ci_medium_build_pool_2404
${{ else }}:
name: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -80,17 +95,26 @@ jobs:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-DCMAKE_BUILD_TYPE=Release
-DGPU_TARGETS=${{ job.target }}
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
@@ -99,8 +123,8 @@ jobs:
# registerROCmPackages: true
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocJPEG_test_${{ job.target }}
dependsOn: rocJPEG_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -120,22 +144,28 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- task: Bash@3
displayName: Build rocJPEG tests
inputs:
targetType: inline
script: |
${{ iif(eq(job.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
mkdir rocJPEG-tests
cd rocJPEG-tests
cmake $(Agent.BuildDirectory)/rocm/share/rocjpeg/test
@@ -144,6 +174,7 @@ jobs:
parameters:
componentName: rocJPEG
testDir: 'rocJPEG-tests'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
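With matrix entries written as inline mappings, ${{ each job in parameters.jobMatrix.buildJobs }} binds every key of the entry (job.os, job.packageManager, job.target) on each iteration, which is what lets one loop fan out across both distros and GPU targets. The pattern in isolation (demo job only):

parameters:
  - name: jobMatrix
    type: object
    default:
      buildJobs:
        - { os: ubuntu2204, packageManager: apt, target: gfx942 }
        - { os: almalinux8, packageManager: dnf, target: gfx90a }

jobs:
  - ${{ each job in parameters.jobMatrix.buildJobs }}:
    - job: demo_build_${{ job.os }}_${{ job.target }}
      steps:
        - script: echo package manager is ${{ job.packageManager }}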

View File

@@ -1,16 +1,29 @@
parameters:
- name: componentName
type: string
default: rocPRIM
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
- name: sparseCheckout
type: boolean
default: false
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -20,18 +33,17 @@ parameters:
type: object
default:
- cmake
- ninja-build
- libgtest-dev
- git
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- clr
- llvm-project
- rocm-cmake
- rocminfo
- ROCR-Runtime
- name: rocmTestDependencies
type: object
default:
@@ -45,98 +57,175 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 1, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 2, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 3, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a, shard: 1, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a, shard: 2, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a, shard: 3, shardCount: 3 }
- name: downstreamComponentMatrix
type: object
default:
- rocThrust:
name: rocThrust
sparseCheckoutDir: projects/rocthrust
skipUnifiedBuild: 'false'
buildDependsOn:
- rocPRIM_build
- hipCUB:
name: hipCUB
sparseCheckoutDir: projects/hipcub
skipUnifiedBuild: 'false'
buildDependsOn:
- rocPRIM_build
# rocSOLVER depends on both rocBLAS and rocPRIM
# for a unified build, rocBLAS will be the one to call rocSOLVER
# - rocSOLVER:
# name: rocSOLVER
# sparseCheckoutDir: projects/rocsolver
# skipUnifiedBuild: 'true'
# buildDependsOn:
# - rocPRIM_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocPRIM_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckout: ${{ parameters.sparseCheckout }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_BENCHMARK=ON
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DAMDGPU_TARGETS=${{ job.target }}
-DBUILD_BENCHMARK=ON
-DBUILD_TEST=ON
-GNinja
extraCxxFlags: -Wno-deprecated-declarations
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocPRIM_test_${{ job.target }}
dependsOn: rocPRIM_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocPRIM
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocprim'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}_shard_${{ job.shard }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocprim'
extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }}'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}
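The rocPRIM test matrix shards each GPU target three ways via extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }}'. Assuming steps/test.yml appends this to a ctest invocation, -I 2,,3 means start at test 2, no explicit end, stride 3, so shard 2 runs tests 2, 5, 8, ... and the three shards together partition the suite. Roughly equivalent to:

steps:
  - script: ctest -I 2,,3 --output-on-failure   # shard 2 of 3
    workingDirectory: $(Agent.BuildDirectory)/rocm/bin/rocprim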

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: rocRAND
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -15,18 +34,16 @@ parameters:
default:
- cmake
- git
- googletest
- libgtest-dev
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- clr
- llvm-project
- rocm-cmake
- rocminfo
- ROCR-Runtime
- name: rocmTestDependencies
type: object
default:
@@ -40,56 +57,96 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
type: object
default:
- hipRAND:
name: hipRAND
sparseCheckoutDir: projects/hiprand
skipUnifiedBuild: 'false'
buildDependsOn:
- rocRAND_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocRAND_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'ubuntu2404') }}:
vmImage: 'ubuntu-24.04'
${{ else }}:
name: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TEST=ON
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DAMDGPU_TARGETS=${{ job.target }}
-DBUILD_TEST=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
@@ -98,42 +155,63 @@ jobs:
# extraEnvVars:
# - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocRAND_test_${{ job.target }}
dependsOn: rocRAND_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocRAND
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocRAND'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocRAND'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}
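
Each jobMatrix row is now a flow mapping carrying os, packageManager, and target, and the compile-time ${{ each }} loop turns one row into one job, so build coverage is widened simply by adding rows. Roughly what the first Ubuntu row compiles to (a sketch; the real expansion includes all the steps with ${{ job.os }} and ${{ job.target }} substituted):

    - job: rocRAND_build_ubuntu2204_gfx942
      pool:
        name: ${{ variables.MEDIUM_BUILD_POOL }}   # else-branch; only ubuntu2404 rows use a hosted vmImage
      workspace:
        clean: all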

.azuredevops/components/rocSOLVER.yml

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: rocSOLVER
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -26,14 +45,12 @@ parameters:
type: object
default:
- clr
- hipSPARSE
- llvm-project
- rocBLAS
- rocm-cmake
- rocminfo
- rocPRIM
- ROCR-Runtime
- rocSPARSE
- name: rocmTestDependencies
type: object
default:
@@ -55,33 +72,47 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
# - { os: almalinux8, packageManager: dnf, target: gfx942 }
# - { os: almalinux8, packageManager: dnf, target: gfx90a }
# - { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
# - { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocSOLVER_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- task: Bash@3
displayName: 'Clone lapack'
inputs:
@@ -92,11 +123,15 @@ jobs:
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: lapack
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_Fortran_FLAGS=-fno-optimize-sibling-calls
@@ -109,6 +144,7 @@ jobs:
installDir: '$(Pipeline.Workspace)/deps-install'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
@@ -120,56 +156,71 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraCopyDirectories:
- deps-install
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraCopyDirectories:
- deps-install
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocSOLVER_test_${{ job.target }}
dependsOn: rocSOLVER_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocSOLVER
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocsolver-test'
testParameters: '--gtest_filter="*checkin*" --gtest_output=xml:./test_output.xml --gtest_color=yes'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocsolver-test'
testParameters: '--gtest_filter="*checkin*" --gtest_output=xml:./test_output.xml --gtest_color=yes'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
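
Switching the disable-list match from variables['Build.DefinitionName'] to the componentName literal matters once several components run inside one pipeline definition: tests are now gated per component rather than per pipeline. The condition reads two variables from the common group; with hypothetical values like these:

    ENABLE_GFX942_TESTS: 'true'
    DISABLED_GFX942_TESTS: 'rocSOLVER,hipBLASLt'   # comma-separated component names

containsValue(split('rocSOLVER,hipBLASLt', ','), 'rocSOLVER') evaluates to true, so the not(...) clause fails and rocSOLVER_test_ubuntu2204_gfx942 is skipped while other components on the same target still run.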

.azuredevops/components/rocThrust.yml

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: rocThrust
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -14,18 +33,17 @@ parameters:
type: object
default:
- cmake
- git
- ninja-build
- libboost-program-options-dev
- googletest
- libfftw3-dev
- git
- python3-pip
- name: rocmDependencies
type: object
default:
- clr
- hipRAND
- llvm-project
- rocm-cmake
- rocminfo
- rocPRIM
- ROCR-Runtime
@@ -36,104 +54,142 @@ parameters:
- llvm-project
- rocminfo
- rocPRIM
- ROCR-Runtime
- hipRAND
- rocprofiler-register
- ROCR-Runtime
- name: jobMatrix
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocThrust_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-GNinja
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DAMDGPU_TARGETS=${{ job.target }}
-DBUILD_TEST=ON
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocThrust_test_${{ job.target }}
dependsOn: rocThrust_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocThrust
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocthrust'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocthrust'
testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml --exclude-regex "scan.hip"'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
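
The new testParameters for rocThrust are CTest flags rather than GoogleTest ones. Assuming steps/test.yml shells out from testDir with the given parameters (a sketch of the assumed behavior, not the template's actual contents), the invocation is roughly:

    - task: Bash@3
      displayName: Run rocThrust tests (sketch)
      inputs:
        targetType: inline
        workingDirectory: $(Agent.BuildDirectory)/rocm/bin/rocthrust
        script: >-
          ctest --output-on-failure --force-new-ctest-process
          --output-junit test_output.xml --exclude-regex "scan.hip"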

.azuredevops/components/rocm-cmake.yml

@@ -16,8 +16,6 @@ parameters:
- doxygen
- doxygen-doc
- ninja-build
- python3-pip
- python3-sphinx
- name: pipModules
type: object
default:
@@ -25,49 +23,75 @@ parameters:
- cmake==3.20.5
- ninja
- rocm-docs-core
- sphinx
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: ubuntu2404, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
jobs:
- job: rocm_cmake
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- task: Bash@3
displayName: Add CMake to PATH
inputs:
targetType: inline
script: echo "##vso[task.prependpath]$(python3 -m site --user-base)/bin"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
- task: Bash@3
displayName: CTest setup
inputs:
targetType: inline
script: |
python -m pip install -r $(Build.SourcesDirectory)/docs/requirements.txt
python -m pip install -r $(Build.SourcesDirectory)/test/docsphinx/docs/.sphinx/requirements.txt
git config --global user.email "you@example.com"
git config --global user.name "Your Name"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocm-cmake
testParameters: '-E "pass-version-parent" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
# environment: combined
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocm_cmake_${{ job.os }}
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
vmImage: 'ubuntu-24.04'
${{ else }}:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- task: Bash@3
displayName: Add CMake to PATH
inputs:
targetType: inline
script: echo "##vso[task.prependpath]$(python3 -m site --user-base)/bin"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
- task: Bash@3
displayName: CTest setup
inputs:
targetType: inline
script: |
python -m pip install -r $(Build.SourcesDirectory)/docs/requirements.txt
python -m pip install -r $(Build.SourcesDirectory)/test/docsphinx/docs/.sphinx/requirements.txt
git config --global user.email "you@example.com"
git config --global user.name "Your Name"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocm-cmake
testParameters: '-E "pass-version-parent" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
# environment: combined
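
The ##vso[task.prependpath] logging command is what lets the pip-installed cmake==3.20.5 and ninja shadow the distribution copies for every later step in the job. A minimal standalone illustration of the pattern:

    steps:
    - script: python3 -m pip install --user cmake==3.20.5 ninja
    - script: echo "##vso[task.prependpath]$(python3 -m site --user-base)/bin"
    - script: cmake --version   # subsequent steps resolve the pip-installed 3.20.5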

.azuredevops/components/rocm-core.yml

@@ -15,39 +15,61 @@ parameters:
default:
- cmake
- ninja-build
- python3-pip
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: ubuntu2404, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
jobs:
- job: rocm_core
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_CURRENT_BINARY_DIR=$PWD
-DCMAKE_CURRENT_SOURCE_DIR=$PWD/../
-DCMAKE_VERBOSE_MAKEFILE=1
-DCPACK_GENERATOR=DEB
-DCPACK_DEBIAN_PACKAGE_RELEASE="local.9999~99.99"
-DCPACK_RPM_PACKAGE_RELEASE="local.9999"
-DROCM_VERSION="$(NEXT_RELEASE_VERSION)"
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocm_core_${{ job.os }}
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
vmImage: 'ubuntu-24.04'
${{ else }}:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_CURRENT_BINARY_DIR=$PWD
-DCMAKE_CURRENT_SOURCE_DIR=$PWD/../
-DCMAKE_VERBOSE_MAKEFILE=1
-DCPACK_GENERATOR=DEB
-DCPACK_DEBIAN_PACKAGE_RELEASE="local.9999~99.99"
-DCPACK_RPM_PACKAGE_RELEASE="local.9999"
-DROCM_VERSION="$(NEXT_RELEASE_VERSION)"
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
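
The almalinux8 rows do not get a dedicated VM image: they keep the ubuntu-22.04 host from the else-branch and run every step inside a manylinux container pulled from the private registry. The compiled form of that row (illustrative):

    - job: rocm_core_almalinux8
      pool:
        vmImage: 'ubuntu-22.04'        # host VM
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3    # service connection for the registry pull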


@@ -15,6 +15,7 @@ parameters:
default:
- cmake
- libglfw3-dev
- libmsgpack-dev
- libtbb-dev
- ninja-build
- python3-pip

.azuredevops/components/rocm_smi_lib.yml

@@ -17,50 +17,66 @@ parameters:
- libdrm-dev
- ninja-build
- pkg-config
- python3-pip
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: ubuntu2404, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: rocm_smi_lib_build
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DBUILD_TESTS=ON
-DROCM_DEP_ROCMCORE=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocm_smi_lib_build_${{ job.os }}
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
vmImage: 'ubuntu-24.04'
${{ else }}:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DBUILD_TESTS=ON
-DROCM_DEP_ROCMCORE=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocm_smi_lib_test_${{ job.target }}
dependsOn: rocm_smi_lib_build
- job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
dependsOn: rocm_smi_lib_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -77,8 +93,11 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
parameters:
runRocminfo: false
@@ -86,8 +105,9 @@ jobs:
parameters:
componentName: rocm_smi_lib
testDir: '$(Agent.BuildDirectory)'
testExecutable: './rocm/share/rocm_smi/rsmitst_tests/rsmitst'
testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
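
Prefixing sudo onto testExecutable is the substantive fix in this test job: rsmitst exercises the SMI sysfs interfaces, which presumably require root on the test runners. Assuming steps/test.yml runs testExecutable with testParameters from testDir (sketch):

    - task: Bash@3
      displayName: Run rsmitst as root (sketch)
      inputs:
        targetType: inline
        workingDirectory: $(Agent.BuildDirectory)
        script: >-
          sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst
          --gtest_output=xml:./test_output.xml --gtest_color=yes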

.azuredevops/components/rocminfo.yml

@@ -17,7 +17,6 @@ parameters:
- libdrm-amdgpu-dev
- libdrm-dev
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
default:
@@ -32,49 +31,63 @@ parameters:
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- job: rocminfo
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
skipLlvmSymlink: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCRTST_BLD_TYPE=release
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocminfo_build_${{ job.os }}
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
skipLlvmSymlink: true
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCRTST_BLD_TYPE=release
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocminfo_test_${{ job.target }}
dependsOn: rocminfo
dependsOn: rocminfo_build_${{ job.os }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -91,14 +104,18 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
parameters:
runRocminfo: false
@@ -109,6 +126,7 @@ jobs:
testExecutable: './rocm/bin/rocminfo'
testParameters: ''
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocm_agent_enumerator
@@ -116,6 +134,7 @@ jobs:
testExecutable: './rocm/bin/rocm_agent_enumerator'
testParameters: ''
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
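
Both rocminfo checks are exit-code smoke tests: testPublishResults is false because neither tool emits JUnit XML. Stripped of the template plumbing they amount to (sketch):

    - script: ./rocm/bin/rocminfo
      workingDirectory: $(Agent.BuildDirectory)
    - script: ./rocm/bin/rocm_agent_enumerator
      workingDirectory: $(Agent.BuildDirectory)
    # a non-zero exit code from either tool fails the job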


@@ -24,24 +24,28 @@ parameters:
default:
- astunparse==1.6.2
- colorlover
- "dash>=1.12.0"
- dash-bootstrap-components
- dash-svg
- "dash>=3.0.0"
- kaleido==0.2.1
- matplotlib
- "numpy>=1.17.5"
- "pandas>=1.4.3"
- plotext
- plotille
- pymongo
- pyyaml
- tabulate
- tqdm
- dash-svg
- dash-bootstrap-components
- kaleido
- setuptools
- plotille
- tabulate
- textual
- textual_plotext
- textual-fspicker
- tqdm
- mock
- pytest
- pytest-cov
- pytest-xdist
- name: rocmDependencies
- name: rocmTestDependencies
type: object
default:
- amdsmi
@@ -114,14 +118,6 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: ${{ job.dependencySource }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
@@ -165,14 +161,6 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: Bash@3
displayName: Add en_US.UTF-8 locale
inputs:
targetType: inline
script: |
sudo locale-gen en_US.UTF-8
sudo update-locale
locale -a
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -184,9 +172,17 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
dependencyList: ${{ parameters.rocmTestDependencies }}
dependencySource: ${{ job.dependencySource }}
gpuTarget: ${{ job.target }}
- task: Bash@3
displayName: Add en_US.UTF-8 locale
inputs:
targetType: inline
script: |
sudo locale-gen en_US.UTF-8
sudo update-locale
locale -a
- task: Bash@3
displayName: Add ROCm binaries to PATH
inputs:

.azuredevops/components/rocprofiler-register.yml

@@ -15,40 +15,62 @@ parameters:
default:
- cmake
- ninja-build
- python3-pip
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: ubuntu2404, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
jobs:
- job: rocprofiler_register
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: rocprofiler-register
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Build.BinariesDirectory)
-DROCPROFILER_REGISTER_BUILD_TESTS=ON
-DROCPROFILER_REGISTER_BUILD_SAMPLES=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocprofiler-register
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# environment: combined
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocprofiler_register_${{ job.os }}
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
vmImage: 'ubuntu-24.04'
${{ else }}:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: rocprofiler-register
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Build.BinariesDirectory)
-DROCPROFILER_REGISTER_BUILD_TESTS=ON
-DROCPROFILER_REGISTER_BUILD_SAMPLES=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocprofiler-register
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# environment: combined
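
useAmdclang: false appears on every CPU-only component in this change set. The assumption is that steps/build-cmake.yml only injects the ROCm LLVM compiler when the flag is true, roughly like this (assumed shape of the shared template, not its real contents):

    parameters:
    - name: useAmdclang
      type: boolean
      default: true
    steps:
    - ${{ if parameters.useAmdclang }}:
      - script: cmake -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++ -GNinja ..
    - ${{ else }}:
      - script: cmake -GNinja ..   # fall back to the image's default host compiler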


@@ -14,10 +14,12 @@ parameters:
type: object
default:
- build-essential
- cmake
- libdrm-amdgpu-dev
- libdrm-dev
- libdw-dev
- libelf-dev
- libsqlite3-dev
- libva-dev
- ninja-build
- pkg-config
@@ -74,8 +76,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

.azuredevops/components/rocprofiler.yml

@@ -1,4 +1,7 @@
parameters:
- name: componentName
type: string
default: rocprofiler
- name: checkoutRepo
type: string
default: 'self'
@@ -15,7 +18,6 @@ parameters:
type: object
default:
- cmake
- libgtest-dev
- libdrm-dev
- libdw-dev
- libsystemd-dev
@@ -26,13 +28,13 @@ parameters:
- name: pipModules
type: object
default:
- pyyaml==5.3.1
- Cppheaderparser
- websockets
- matplotlib
- lxml
- barectf
- Cppheaderparser
- lxml
- matplotlib
- pandas
- pyyaml==5.3.1
- websockets
- name: rocmDependencies
type: object
default:
@@ -41,29 +43,33 @@ parameters:
- ROCdbgapi
- rocm-cmake
- rocm-core
- rocm_smi_lib
- rocminfo
- ROCR-Runtime
- rocm_smi_lib
- rocprofiler-register
- ROCR-Runtime
- roctracer
- name: jobMatrix
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocprofiler_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -72,6 +78,10 @@ jobs:
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
@@ -79,46 +89,59 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_MODULE_PATH=$(Build.SourcesDirectory)/cmake_modules;$(Agent.BuildDirectory)/rocm/lib/cmake;$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_MODULE_PATH=$(Build.SourcesDirectory)/cmake_modules;$(Agent.BuildDirectory)/rocm/lib/cmake;$(Agent.BuildDirectory)/rocm/lib/cmake/hip;$(Agent.BuildDirectory)/rocm/lib64/cmake;$(Agent.BuildDirectory)/rocm/lib64/cmake/hip
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DENABLE_LDCONFIG=OFF
-DUSE_PROF_API=1
-DGPU_TARGETS=${{ job.target }}
-DAMDGPU_TARGETS=${{ job.target }}
multithreadFlag: -- -j32
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ROCM_PATH:::/home/user/workspace/rocm
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ROCM_PATH:::/home/user/workspace/rocm
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocprofiler_test_${{ job.target }}
dependsOn: rocprofiler_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -139,16 +162,21 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
@@ -157,12 +185,14 @@ jobs:
testExecutable: ./run.sh
testParameters: ''
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocprofilerV2
testDir: $(Agent.BuildDirectory)/rocm
testExecutable: share/rocprofiler/tests/runUnitTests
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
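
Two details stand out here: the lib64 additions to CMAKE_MODULE_PATH make package discovery work inside the manylinux container, where CMake configs install under lib64 rather than lib, and the new quotes around the semicolon-joined CMAKE_PREFIX_PATH keep the shell from splitting the command. A minimal illustration of the quoting hazard (sketch):

    - script: cmake -DCMAKE_PREFIX_PATH=/opt/rocm;/opt/vendor ..
      # unquoted: the shell runs `cmake -DCMAKE_PREFIX_PATH=/opt/rocm`, then `/opt/vendor ..`
    - script: cmake -DCMAKE_PREFIX_PATH="/opt/rocm;/opt/vendor" ..
      # quoted: one argument, both prefixes searched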

.azuredevops/components/roctracer.yml

@@ -1,4 +1,7 @@
parameters:
- name: componentName
type: string
default: roctracer
- name: checkoutRepo
type: string
default: 'self'
@@ -18,7 +21,7 @@ parameters:
- graphviz
- libdrm-amdgpu-dev
- ninja-build
- python3-pip
- zlib1g-dev
- name: pipModules
type: object
default:
@@ -45,26 +48,32 @@ parameters:
type: object
default:
buildJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: roctracer_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
@@ -72,6 +81,7 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -83,21 +93,27 @@ jobs:
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
# the linker flags will not affect ubuntu2204 builds as the paths do not exist
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=release
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip;$(Agent.BuildDirectory)/rocm/lib64/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DGPU_TARGETS=${{ job.target }}
-DAMDGPU_TARGETS=${{ job.target }}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -108,8 +124,8 @@ jobs:
# registerROCmPackages: true
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: roctracer_test_${{ job.target }}
dependsOn: roctracer_build_${{ job.target }}
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -127,17 +143,20 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
@@ -146,6 +165,7 @@ jobs:
testParameters: ''
testDir: $(Agent.BuildDirectory)
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}


@@ -11,36 +11,54 @@ parameters:
- name: aptPackages
type: object
default:
- git
- cmake
- git
- ninja-build
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
jobs:
- job: gtest
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: Bash@3
displayName: 'git clone gtest'
inputs:
targetType: inline
script: git clone -b ${{ parameters.gtestVersion }} https://github.com/google/googletest --depth=1 --shallow-submodules --recurse-submodules
workingDirectory: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
cmakeBuildDir: $(Agent.BuildDirectory)/googletest/build
cmakeSourceDir: $(Agent.BuildDirectory)/googletest
extraBuildFlags: >-
-DGTEST_FORCE_SHARED_CRT=ON
-DCMAKE_DEBUG_POSTFIX=d
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: gtest_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: Bash@3
displayName: Clone GTest ${{ parameters.gtestVersion }}
inputs:
targetType: inline
script: git clone https://github.com/google/googletest -b ${{ parameters.gtestVersion }} --depth=1 --shallow-submodules --recurse-submodules
workingDirectory: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
cmakeBuildDir: $(Agent.BuildDirectory)/googletest/build
cmakeSourceDir: $(Agent.BuildDirectory)/googletest
useAmdclang: false
extraBuildFlags: >-
-DGTEST_FORCE_SHARED_CRT=ON
-DCMAKE_DEBUG_POSTFIX=d
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}


@@ -4,71 +4,71 @@ parameters:
- name: aptPackages
type: object
default:
- build-essential
- git
- ninja-build
- openjdk-8-jdk
- ca-certificates
- autoconf
- bc
- bridge-utils
- build-essential
- ca-certificates
- ccache
- devscripts
- dkms
- doxygen
- fakeroot
- ffmpeg
- gfortran
- git
- gnutls-bin
- libamd2
- libavformat-dev
- libblas3
- libcamd2
- libccolamd2
- libcholmod3
- libcolamd2
- libdpkg-dev
- libdpkg-perl
- libdrm-amdgpu1
- libdrm-dev
- libelf-dev
- libfreetype-dev
- libgfortran5
- libgomp1
- libjpeg-dev
- libjpeg-turbo-official
- liblapack-dev
- liblapack3
- libmetis5
- libncurses-dev
- libnuma-dev
- libopenblas-dev
- libpth-dev
- libquadmath0
- libssh-dev
- libstdc++-12-dev
- libsuitesparseconfig5
- libswscale-dev
- libtinfo-dev
- libunwind-dev
- libwebp-dev
- llvm-dev
- ncurses-base
- ninja-build
- numactl
- openjdk-8-jdk
- python-is-python3
- python3-dev
- python3-pip
- python3-venv
- wget
- ncurses-base
- libncurses-dev
- numactl
- libnuma-dev
- libssh-dev
- libunwind-dev
- llvm-dev
- libpth-dev
- qemu-kvm
- re2c
- subversion
- fakeroot
- autoconf
- libgomp1
- libtinfo-dev
- libcholmod3
- libsuitesparseconfig5
- libstdc++-12-dev
- python-is-python3
- gfortran
- libgfortran5
- liblapack3
- libblas3
- libquadmath0
- libmetis5
- libamd2
- libcamd2
- libcolamd2
- libccolamd2
- libdrm-amdgpu1
- ccache
- wget
- zip
- libjpeg-turbo-official
- libjpeg-dev
- libwebp-dev
- libfreetype-dev
- gnutls-bin
- ffmpeg
- libopenblas-dev
- liblapack-dev
- libswscale-dev
- libavformat-dev
- name: pipModules
type: object
default:
- cmake
- astunparse
- "expecttest>=0.2.1"
- "expecttest>=0.3.0"
- hypothesis
- numpy
- psutil
@@ -76,8 +76,8 @@ parameters:
- requests
- setuptools==75.8.0
- types-dataclasses
- "typing-extensions>=4.8.0"
- "sympy>=1.13.0"
- "typing-extensions>=4.10.0"
- "sympy>=1.13.3"
- filelock
- networkx
- jinja2
@@ -97,36 +97,39 @@ parameters:
- name: rocmDependencies
type: object
default:
- rocminfo
- MIOpen
- clr
- hipBLAS
- hipBLASLt
- hipFFT
- hipRAND
- hipSOLVER
- hipSPARSE
- ROCR-Runtime
- hipSPARSELt
- llvm-project
- MIOpen
- rccl
- rocBLAS
- rocFFT
- rocm-core
- rocminfo
- rocm_smi_lib
- rocPRIM
- rocprofiler-register
- rocRAND
- ROCR-Runtime
- rocSOLVER
- rocSPARSE
- roctracer
- hipBLASLt
- rocprofiler-register
- rocm-core
- rocPRIM
# below are additional dependencies not called out by the build script, but their absence causes errors during cmake
- composable_kernel
- hipBLAS-common
- hipCUB
- rocThrust
- hipBLAS-common
- composable_kernel
- name: rocmTestDependencies
type: object
default:
# rocroller.so is needed and is not included in the wheel
- hipBLASLt
- rocminfo
# Reference on what tests to run for torchvision found in private repo:
# https://github.com/ROCm/rocAutomation/blob/jenkins-pipelines/pytorch/pytorch_ci/test_pytorch_test1.sh#L54
@@ -240,12 +243,6 @@ jobs:
git clone https://github.com/pytorch/builder.git --depth=1 --recurse-submodules
sudo ln -s $(Build.SourcesDirectory)/builder /builder
workingDirectory: $(Build.SourcesDirectory)
- task: Bash@3
displayName: Temporarily Patch CK Submodule
inputs:
targetType: inline
script: git pull origin develop
workingDirectory: $(Build.SourcesDirectory)/pytorch/third_party/composable_kernel
- task: Bash@3
displayName: Install patchelf
inputs:
@@ -267,6 +264,11 @@ jobs:
script: |
sudo bash pytorch/.ci/docker/common/install_rocm_magma.sh $(MAGMA_ROCM)
workingDirectory: $(Build.SourcesDirectory)
- task: Bash@3
displayName: Install targeted typing_extensions for build
inputs:
targetType: inline
script: pip install --target=$(Build.SourcesDirectory)/pytorch/torch/.. typing_extensions
- task: Bash@3
displayName: Run ROCm Build Script
inputs:
@@ -281,7 +283,6 @@ jobs:
PYTORCH_ROOT=$(PYTORCH_ROOT)
CMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
DESIRED_DEVTOOLSET=$(DESIRED_DEVTOOLSET)
TORCH_PACKAGE_NAME=torch.$(ROCM_BRANCH).$(JOB_GPU_TARGET)
PYTORCH_BUILD_VERSION=$(cat $(Build.SourcesDirectory)/pytorch/version.txt | cut -da -f1)
PYTORCH_BUILD_NUMBER=$(date -u +%Y%m%d)
SKIP_ALL_TESTS=1
@@ -322,8 +323,6 @@ jobs:
inputs:
targetType: inline
script: >-
TORCH_PACKAGE_NAME=torch.$(ROCM_BRANCH).$(JOB_GPU_TARGET)
TORCHVISION_PACKAGE_NAME=torchvision.$(ROCM_BRANCH).$(JOB_GPU_TARGET)
PYTORCH_VERSION=$(cat $(Build.SourcesDirectory)/pytorch/version.txt | cut -da -f1)post$(date -u +%Y%m%d)
BUILD_VERSION=$(cat $(Build.SourcesDirectory)/vision/version.txt | cut -da -f1)post$(date -u +%Y%m%d)
python3 setup.py bdist_wheel
@@ -400,11 +399,9 @@ jobs:
- task: DownloadPipelineArtifact@2
displayName: 'Download Pipeline Wheel Files'
inputs:
itemPattern: '**/*$(JOB_GPU_TARGET)*.whl'
itemPattern: '**/*.whl'
targetPath: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
dependencySource: staging
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmTestDependencies }}


@@ -3,12 +3,21 @@ parameters:
- name: jobList
type: object
default:
- gfx942-staging:
target: gfx942
source: staging
- gfx90a-staging:
target: gfx90a
source: staging
- { os: ubuntu2204, target: gfx942, source: staging }
- { os: ubuntu2204, target: gfx90a, source: staging }
- { os: ubuntu2204, target: gfx1201, source: staging }
- { os: ubuntu2204, target: gfx1100, source: staging }
- { os: ubuntu2204, target: gfx1030, source: staging }
- { os: ubuntu2404, target: gfx942, source: staging }
- { os: ubuntu2404, target: gfx90a, source: staging }
- { os: ubuntu2404, target: gfx1201, source: staging }
- { os: ubuntu2404, target: gfx1100, source: staging }
- { os: ubuntu2404, target: gfx1030, source: staging }
- { os: almalinux8, target: gfx942, source: staging }
- { os: almalinux8, target: gfx90a, source: staging }
- { os: almalinux8, target: gfx1201, source: staging }
- { os: almalinux8, target: gfx1100, source: staging }
- { os: almalinux8, target: gfx1030, source: staging }
- name: rocmDependencies
type: object
default:
@@ -16,9 +25,9 @@ parameters:
- amdsmi
- aomp-extras
- aomp
- clr
- composable_kernel
- half
- HIP
- hip-tests
- hipBLAS
- hipBLAS-common
@@ -83,7 +92,7 @@ schedules:
jobs:
- ${{ each job in parameters.jobList }}:
- job: rocm_nightly_${{ job.target }}_${{ job.source }}
- job: rocm_nightly_${{ job.os }}_${{ job.target }}_${{ job.source }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -108,9 +117,9 @@ jobs:
parameters:
dependencySource: ${{ job.source }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
skipLibraryLinking: true
skipLlvmSymlink: true
- script: df -h
displayName: System disk space after ROCm
- script: du -sh $(Agent.BuildDirectory)/rocm


@@ -28,12 +28,22 @@ resources:
endpoint: ROCm
name: ROCm/hipother
ref: ${{ parameters.checkoutRef }}
pipelines:
- pipeline: hip_pipeline
source: \experimental\HIP
trigger: true
- pipeline: hipother_pipeline
source: \experimental\hipother
trigger: true
trigger: none
pr: none
jobs:
- template: ${{ variables.CI_COMPONENT_PATH }}/HIP.yml
parameters:
checkoutRepo: release_repo
checkoutRef: ${{ parameters.checkoutRef }}
- ${{ if eq(variables['Build.Reason'], 'ResourceTrigger') }}:
- template: ${{ variables.CI_COMPONENT_PATH }}/copyHIP.yml@pipelines_repo
- ${{ if ne(variables['Build.Reason'], 'ResourceTrigger') }}:
- template: ${{ variables.CI_COMPONENT_PATH }}/HIP.yml@pipelines_repo
parameters:
checkoutRepo: release_repo
checkoutRef: ${{ parameters.checkoutRef }}


@@ -19,36 +19,24 @@ parameters:
default: false
steps:
- task: Bash@3
displayName: Set allowPartiallySucceededBuilds
inputs:
targetType: inline
script: |
if [[ ",$ALLOWED_PARTIAL_SUCCEED_BUILDS," == *",${{ parameters.componentName }},"* ]]; then
echo "##vso[task.setvariable variable=allowPartiallySucceededBuilds;]true"
else
echo "##vso[task.setvariable variable=allowPartiallySucceededBuilds;]false"
fi
- task: DownloadPipelineArtifact@2
displayName: Download ${{ parameters.componentName }}
inputs:
${{ if eq(parameters.aggregatePipeline, false) }}:
itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
targetPath: '$(Pipeline.Workspace)/d'
allowPartiallySucceededBuilds: true
${{ if parameters.aggregatePipeline }}:
buildType: 'current'
${{ else }}:
buildType: 'specific'
project: ROCm-CI
definition: ${{ parameters.pipelineId }}
specificBuildWithTriggering: true
itemPattern: '**/*${{ parameters.fileFilter }}*'
# aomp is a special case, since the trigger file is under ROCm/ROCm instead of the component repo
${{ if notIn(parameters.componentName, 'aomp') }}:
buildVersionToDownload: latestFromBranch # default is 'latest'
definition: ${{ parameters.pipelineId }}
branchName: refs/heads/${{ parameters.branchName }}
allowPartiallySucceededBuilds: $(allowPartiallySucceededBuilds)
targetPath: '$(Pipeline.Workspace)/d'
${{ else }}:
buildType: 'current'
itemPattern: '**/${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
allowPartiallySucceededBuilds: $(allowPartiallySucceededBuilds)
targetPath: '$(Pipeline.Workspace)/d'
${{ if eq(parameters.componentName, 'aomp') }}:
buildVersionToDownload: latest # aomp trigger lives in ROCm/ROCm, so it cannot use ROCm/aomp branch names
${{ else }}:
buildVersionToDownload: latestFromBranch
- task: ExtractFiles@1
displayName: Extract ${{ parameters.componentName }}
inputs:


@@ -3,15 +3,21 @@
# publish can be toggled off for jobs that produce multiple tarballs
# in those cases, only the last call should publish, which puts all the tarballs in one container folder
parameters:
- name: artifactName
- name: componentName
type: string
default: 'drop'
- name: publish
type: boolean
default: true
default: $(Build.DefinitionName)
- name: gpuTarget
type: string
default: ''
- name: artifactName
type: string
default: drop
- name: publish
type: boolean
default: true
- name: os
type: string
default: 'ubuntu2204'
steps:
- task: ArchiveFiles@2
@@ -20,7 +26,7 @@ steps:
includeRootFolder: false
archiveType: 'tar'
tarCompression: 'gz'
archiveFile: '$(Build.ArtifactStagingDirectory)/$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz'
archiveFile: '$(Build.ArtifactStagingDirectory)/${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz'
- task: DeleteFiles@1
displayName: 'Cleanup Staging Area'
inputs:
@@ -32,7 +38,7 @@ steps:
inputs:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: echo "$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz" >> pipelineArtifacts.txt
script: echo "${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz" >> pipelineArtifacts.txt
# then publish it
- ${{ if parameters.publish }}:
- task: PublishPipelineArtifact@1
@@ -40,4 +46,5 @@ steps:
displayName: '${{ parameters.artifactName }} Publish'
retryCountOnTaskFailure: 3
inputs:
artifactName: ${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt)
targetPath: '$(Build.ArtifactStagingDirectory)'
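
For reference, a minimal shell sketch of the new artifact naming scheme. All values below are hypothetical stand-ins; in the pipeline they come from template parameters and the predefined Azure variables shown above.

componentName=roctracer; os=almalinux8; gpuTarget=gfx942; artifactName=drop   # hypothetical
buildId=12345; buildNumber=20250626.1; jobAttempt=1                           # hypothetical
archive="${componentName}_${buildId}_${buildNumber}_${os}_${gpuTarget}_${artifactName}_${jobAttempt}.tar.gz"
echo "$archive"   # roctracer_12345_20250626.1_almalinux8_gfx942_drop_1.tar.gz

Folding the OS into the name keeps per-OS artifacts distinct, and appending $(System.JobAttempt) presumably lets rerun jobs publish without colliding with the first attempt's artifact names.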


@@ -1,4 +1,7 @@
parameters:
- name: os
type: string
default: 'ubuntu2204'
- name: componentName
type: string
default: ''
@@ -20,17 +23,23 @@ steps:
displayName: '${{ parameters.componentName }} configure flags'
inputs:
targetType: inline
script: ./configure --prefix=${{ parameters.installDir }} ${{ parameters.configureFlags }}
workingDirectory: ${{ parameters.buildDir }}
script: |
${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
./configure --prefix=${{ parameters.installDir }} ${{ parameters.configureFlags }}
- task: Bash@3
displayName: '${{ parameters.componentName }} make'
inputs:
targetType: inline
script: ${{ parameters.makeCallPrefix }} make -j$(nproc)
workingDirectory: ${{ parameters.buildDir }}
script: |
${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
${{ parameters.makeCallPrefix }} make -j$(nproc)
- task: Bash@3
displayName: '${{ parameters.componentName }} make install'
inputs:
targetType: inline
script: make install
workingDirectory: ${{ parameters.buildDir }}
script: |
${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
make install


@@ -1,10 +1,16 @@
parameters:
- name: os
type: string
default: 'ubuntu2204'
- name: componentName
type: string
default: ''
- name: extraBuildFlags
type: string
default: ''
- name: extraCxxFlags
type: string
default: ''
- name: multithreadFlag
type: string
default: ''
@@ -32,41 +38,81 @@ parameters:
- name: installEnabled
type: boolean
default: true
# for jobs that rebuild during the install step and use ninja
# set to true to save time; only applies to almalinux8
- name: consolidateBuildAndInstall
type: boolean
default: false
- name: printDiskSpace
type: boolean
default: true
# todo: make this control cxx and c compiler flags
- name: useAmdclang
type: boolean
default: true
# for cmake calls, set env variables for AlmaLinux 8
# to simulate running source /opt/rh/gcc-toolset-14/enable for the session
steps:
# create workingDirectory if it does not exist and change into it
# call cmake from within that directory using $cmakeArgs as its parameters
- task: CMake@1
displayName: '${{parameters.componentName }} CMake Flags'
${{ if eq(parameters.os, 'almalinux8')}}:
env:
PATH: "/opt/rh/gcc-toolset-14/root/usr/bin:$(PATH)"
MANPATH: "/opt/rh/gcc-toolset-14/root/usr/share/man:$(MANPATH)"
INFOPATH: "/opt/rh/gcc-toolset-14/root/usr/share/info:$(INFOPATH)"
PCP_DIR: "/opt/rh/gcc-toolset-14/root"
LD_LIBRARY_PATH: "/opt/rh/gcc-toolset-14/root/usr/lib64:/opt/rh/gcc-toolset-14/root/usr/lib:$(LD_LIBRARY_PATH)"
PKG_CONFIG_PATH: "/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig:$(PKG_CONFIG_PATH)"
inputs:
workingDirectory: ${{ parameters.cmakeBuildDir }}
${{ if eq(parameters.customInstallPath, true) }}:
cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
${{ else }}:
cmakeArgs: ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
cmakeArgs: >-
${{ iif(parameters.customInstallPath, join('', format('-DCMAKE_INSTALL_PREFIX={0}', parameters.installDir)), '') }}
${{ iif(eq(parameters.os, 'almalinux8'), '-DCMAKE_SHARED_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/lib64 -L/opt/rh/gcc-toolset-14/root/usr/lib/gcc/x86_64-redhat-linux/14/"', '') }}
${{ iif(eq(parameters.os, 'almalinux8'), '-DCMAKE_EXE_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/lib64 -L/opt/rh/gcc-toolset-14/root/usr/lib/gcc/x86_64-redhat-linux/14/"', '') }}
-DCMAKE_CXX_FLAGS="${{ parameters.extraCxxFlags }} ${{ iif(and(eq(parameters.os, 'almalinux8'), parameters.useAmdclang), '--gcc-toolchain=/opt/rh/gcc-toolset-14/root', '') }}"
${{ parameters.extraBuildFlags }}
${{ parameters.cmakeSourceDir }}
- ${{ if parameters.printDiskSpace }}:
- script: df -h
displayName: Disk space before build
# equivalent to running make $cmakeTargetDir from $cmakeBuildDir
# i.e., cd $cmakeBuildDir; make $cmakeTargetDir
- task: CMake@1
displayName: '${{parameters.componentName }} Build'
${{ if and( eq(parameters.os, 'almalinux8'), eq(parameters.consolidateBuildAndInstall , true)) }}:
displayName: '${{ parameters.componentName }} CMake Build and Install'
${{ else }}:
displayName: '${{ parameters.componentName }} CMake Build'
${{ if eq(parameters.os, 'almalinux8')}}:
env:
PATH: "/opt/rh/gcc-toolset-14/root/usr/bin:$(PATH)"
MANPATH: "/opt/rh/gcc-toolset-14/root/usr/share/man:$(MANPATH)"
INFOPATH: "/opt/rh/gcc-toolset-14/root/usr/share/info:$(INFOPATH)"
PCP_DIR: "/opt/rh/gcc-toolset-14/root"
LD_LIBRARY_PATH: "/opt/rh/gcc-toolset-14/root/usr/lib64:/opt/rh/gcc-toolset-14/root/usr/lib:$(LD_LIBRARY_PATH)"
PKG_CONFIG_PATH: "/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig:$(PKG_CONFIG_PATH)"
inputs:
workingDirectory: ${{ parameters.cmakeBuildDir }}
${{ if eq(parameters.customBuildTarget, '') }}:
cmakeArgs: '--build ${{ parameters.cmakeTargetDir }} ${{ parameters.multithreadFlag }}'
${{ else }}:
cmakeArgs: '--build ${{ parameters.cmakeTargetDir }} --target ${{ parameters.customBuildTarget }} ${{ parameters.multithreadFlag }}'
retryCountOnTaskFailure: 10
${{ if eq(parameters.os, 'almalinux8') }}:
cmakeArgs: >-
--build ${{ parameters.cmakeTargetDir }}
${{ iif(and(eq(parameters.consolidateBuildAndInstall, true), ne(parameters.cmakeTarget, '')), format('--target {0}', parameters.cmakeTarget), '') }}
${{ iif(and(ne(parameters.customBuildTarget, ''), ne(parameters.consolidateBuildAndInstall, true)), format('--target {0}', parameters.customBuildTarget), '') }}
${{ parameters.multithreadFlag }}
${{ if ne(parameters.os, 'almalinux8') }}:
cmakeArgs: >-
--build ${{ parameters.cmakeTargetDir }}
${{ iif(ne(parameters.customBuildTarget, ''), format('--target {0}', parameters.customBuildTarget), '') }}
${{ parameters.multithreadFlag }}
- ${{ if parameters.printDiskSpace }}:
- script: df -h
displayName: Disk space after build
# equivalent to running make $cmakeTarget from $cmakeBuildDir
# e.g., make install
- ${{ if eq(parameters.installEnabled, true) }}:
- ${{ if and(eq(parameters.installEnabled, true), or(ne(parameters.os, 'almalinux8'), eq(parameters.consolidateBuildAndInstall, false))) }}:
- task: CMake@1
displayName: '${{parameters.componentName }} ${{ parameters.cmakeTarget }}'
inputs:
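
The env: blocks above replicate, for each CMake task, what sourcing the gcc-toolset enable script would do in a shell. A minimal sketch of the equivalent exports, with the paths taken from the diff above; this has to happen per task because every task runs in a fresh process, so a source in an earlier Bash step would not persist.

# shell equivalent of the almalinux8 env: block (paths from the diff above)
export PATH=/opt/rh/gcc-toolset-14/root/usr/bin:$PATH
export MANPATH=/opt/rh/gcc-toolset-14/root/usr/share/man:$MANPATH
export INFOPATH=/opt/rh/gcc-toolset-14/root/usr/share/info:$INFOPATH
export PCP_DIR=/opt/rh/gcc-toolset-14/root
export LD_LIBRARY_PATH=/opt/rh/gcc-toolset-14/root/usr/lib64:/opt/rh/gcc-toolset-14/root/usr/lib:$LD_LIBRARY_PATH
export PKG_CONFIG_PATH=/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig:$PKG_CONFIG_PATH
gcc --version   # should now report the gcc-toolset-14 compiler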


@@ -4,9 +4,6 @@ parameters:
- name: checkoutRepo
type: string
default: 'self'
- name: sparseCheckout
type: boolean
default: false
- name: sparseCheckoutDir
type: string
default: ''
@@ -22,10 +19,10 @@ steps:
submodules: ${{ parameters.submoduleBehaviour }}
retryCountOnTaskFailure: 3
fetchFilter: blob:none
${{ if eq(parameters.sparseCheckout, true) }}:
${{ if ne(parameters.sparseCheckoutDir, '') }}:
sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }}
path: sparse
- ${{ if eq(parameters.sparseCheckout, true) }}:
- ${{ if ne(parameters.sparseCheckoutDir, '') }}:
- task: Bash@3
displayName: Symlink sparse checkout
inputs:


@@ -0,0 +1,42 @@
parameters:
- name: aptPackages
type: object
default: []
- name: registerROCmPackages
type: boolean
default: false
steps:
- ${{ if eq(parameters.registerROCmPackages, true) }}:
- task: Bash@3
displayName: 'Register AMDGPU & ROCm repos (apt)'
inputs:
targetType: inline
script: |
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/$(REPO_RADEON_VERSION)/ubuntu jammy main" | sudo tee /etc/apt/sources.list.d/amdgpu.list
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION) jammy main" | sudo tee --append /etc/apt/sources.list.d/rocm.list
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt update
- task: Bash@3
displayName: 'sudo apt-get update'
inputs:
targetType: inline
script: |
echo "deb http://archive.ubuntu.com/ubuntu/ jammy main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes update
- task: Bash@3
displayName: 'sudo apt-get fix'
inputs:
targetType: inline
script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-broken install
- ${{ if gt(length(parameters.aptPackages), 0) }}:
- task: Bash@3
displayName: 'sudo apt-get install ...'
inputs:
targetType: inline
script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install ${{ join(' ', parameters.aptPackages) }}


@@ -1,25 +1,44 @@
parameters:
- name: os
type: string
default: ubuntu2204
steps:
- task: Bash@3
displayName: Get aqlprofile package name
inputs:
targetType: inline
script: |
export packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb")
echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: |
export packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb")
echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
${{ if eq(parameters.os, 'almalinux8') }}:
script: |
export packageName=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1)
echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
- task: Bash@3
displayName: 'Download aqlprofile'
inputs:
targetType: inline
script: wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$(packageName)
workingDirectory: '$(Pipeline.Workspace)'
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$(packageName)
${{ if eq(parameters.os, 'almalinux8') }}:
script: wget -nv https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/$(packageName)
- task: Bash@3
displayName: 'Extract aqlprofile'
inputs:
targetType: inline
script: |
mkdir hsa-amd-aqlprofile
dpkg-deb -R $(packageName) hsa-amd-aqlprofile
workingDirectory: '$(Pipeline.Workspace)'
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: |
mkdir hsa-amd-aqlprofile
dpkg-deb -R $(packageName) hsa-amd-aqlprofile
${{ if eq(parameters.os, 'almalinux8') }}:
script: |
mkdir hsa-amd-aqlprofile
sudo dnf -y install rpm-build cpio
rpm2cpio $(packageName) | (cd hsa-amd-aqlprofile && cpio -idmv)
- task: Bash@3
displayName: 'Copy aqlprofile files'
inputs:
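
A condensed sketch of the per-OS package discovery above. REPO_RADEON_VERSION is hard-coded here as an example; the pipeline reads it from its variable group.

REPO_RADEON_VERSION=6.4.1   # hypothetical; the pipeline uses $(REPO_RADEON_VERSION)
# ubuntu2204: scrape the apt pool index for a .deb matching the OS release
pkg=$(curl -s "https://repo.radeon.com/rocm/apt/${REPO_RADEON_VERSION}/pool/main/h/hsa-amd-aqlprofile/" \
      | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb")
# almalinux8: scrape the rhel8 index for the first matching .rpm
pkg=$(curl -s "https://repo.radeon.com/rocm/rhel8/${REPO_RADEON_VERSION}/main/" \
      | grep -oP 'hsa-amd-aqlprofile-[^"]+\.rpm' | head -n1)
echo "$pkg"

Note that the extraction step also differs by format: dpkg-deb -R unpacks the .deb directly, while the .rpm is unpacked without installation via rpm2cpio piped into cpio.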


@@ -1,35 +0,0 @@
steps:
- task: DownloadPipelineArtifact@2
displayName: Download Boost
inputs:
buildType: specific
project: ROCm-CI
definition: $(BOOST_DEPENDENCY_PIPELINE_ID)
targetPath: $(Pipeline.Workspace)/d
- task: ExtractFiles@1
displayName: Extract Boost
inputs:
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
destinationFolder: $(Agent.BuildDirectory)/boost
cleanDestinationFolder: true
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: Cleanup Compressed Boost
inputs:
SourceFolder: $(Pipeline.Workspace)/d
Contents: '**/*.tar.gz'
RemoveDotFiles: true
- task: Bash@3
displayName: 'List Boost files'
inputs:
targetType: inline
script: ls -1R $(Agent.BuildDirectory)/boost
- task: Bash@3
displayName: 'Link Boost shared libraries'
inputs:
targetType: inline
script: |
echo $(Agent.BuildDirectory)/boost/lib | sudo tee /etc/ld.so.conf.d/boost.conf
sudo cat /etc/ld.so.conf.d/boost.conf
sudo ldconfig -v
ldconfig -p


@@ -1,10 +1,23 @@
# replace the apt-installed cmake with a newer version downloaded from the Kitware GitHub releases
steps:
- task: Bash@3
displayName: update cmake
displayName: Install CMake 3.31
inputs:
targetType: inline
script: |
sudo apt purge cmake -y
sudo snap install cmake --classic --channel=3.31/stable
hash -r
CMAKE_VERSION=3.31.0
CMAKE_ROOT="$(Pipeline.Workspace)/cmake"
echo "Downloading CMake $CMAKE_VERSION..."
curl -fsSL -o cmake.tar.gz https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz
echo "Extracting to $CMAKE_ROOT..."
sudo mkdir -p $CMAKE_ROOT
sudo tar --strip-components=1 -xz -C $CMAKE_ROOT -f cmake.tar.gz
echo "##vso[task.prependpath]$CMAKE_ROOT/bin"
- task: Bash@3
displayName: cmake --version
inputs:
targetType: inline
script: |
cmake --version


@@ -0,0 +1,157 @@
parameters:
- name: aptPackages
type: object
default: []
- name: registerROCmPackages
type: boolean
default: false
# As part of installing the gcc toolset and python,
# the environment will install this base set of dnf packages.
- name: basePackages
type: object
default:
- epel-release
- gcc-toolset-14
- gcc-toolset-14-libatomic-devel
- git
- jq
- numactl
- python3.11
- python3.11-pip
- vim-common
- wget
# Instead of defining multiple arrays of packages per component,
# we define a map of apt package names to dnf package names.
- name: aptToDnfMap
type: object
default:
bison: bison
ccache: ccache
cmake: cmake
cuda-toolkit-12-9: cuda-compiler-12-9 cuda-toolkit-12-9
libcudnn9-dev-cuda-12: libcudnn9-cuda-12
dejagnu: dejagnu
doxygen: doxygen
# note: doxygen-doc is not available in dnf
# libavcodec-dev, libavformat-dev, libavutil-dev come with ffmpeg-devel
ffmpeg: ffmpeg ffmpeg-devel
flex: flex
# note: g++ is installed by default with gcc-toolset-14
# note: gawk is already installed
# note: gcc-toolset-14-gfortran is installed by default with gcc-toolset-14
# note: git is in the base packages list
graphviz: graphviz
libbabeltrace-dev: libbabeltrace-devel
libbison-dev: bison-devel
libboost-program-options-dev: boost-devel
# note: libdrm-amdgpu1 is not available in dnf
libdrm-dev: libdrm-devel
libdrm-amdgpu-dev: libdrm-amdgpu-devel
libdw-dev: elfutils-devel
libelf-dev: elfutils-libelf-devel
libexpat-dev: expat-devel
libffi-dev: libffi-devel
libfftw3-dev: fftw-devel
libgmp-dev: gmp-devel
liblzma-dev: xz-devel
libmpfr-dev: mpfr-devel
libmsgpack-dev: msgpack-devel
libncurses5-dev: ncurses-devel
libnuma-dev: numactl-devel
libopenmpi-dev: openmpi-devel
libpci-dev: libpciaccess-devel
libssl-dev: openssl-devel
# note: libstdc++-devel is in the base packages list
libsystemd-dev: systemd-devel
libtool: libtool
# note: libudev-dev is part of systemd-devel
libva-amdgpu-dev: libva-amdgpu-devel
mesa-amdgpu-va-drivers: mesa-amdgpu-va-drivers
mesa-common-dev: mesa-libGL-devel
ncurses-dev: ncurses-devel
# note: llvm needs a ninja-build version newer than what dnf provides
ocl-icd-libopencl1: ocl-icd
ocl-icd-opencl-dev: ocl-icd-devel
opencl-headers: opencl-headers
parallel: parallel
pkg-config: pkgconf-pkg-config
# note: python3 is the default python in AlmaLinux 8
python3-dev: python3.11-devel
# note: python3.11-pip is already installed when updating to python 3.11
# note: python3.11-setuptools is already installed when updating to python 3.11
texinfo: texinfo
zlib1g-dev: zlib-devel
steps:
- ${{ if eq(parameters.registerROCmPackages, true) }}:
- task: Bash@3
displayName: 'Register AMDGPU & ROCm repos (dnf)'
inputs:
targetType: inline
script: |
sudo rpm --import https://repo.radeon.com/rocm/rocm.gpg.key
echo '[amdgpu]' | sudo tee /etc/yum.repos.d/amdgpu.repo > /dev/null
echo "name=amdgpu" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
echo "baseurl=https://repo.radeon.com/amdgpu/$(REPO_RADEON_VERSION)/rhel/8.10/main/x86_64/" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
echo "enabled=1" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
echo "gpgcheck=1" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
echo "gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
echo '[rocm]' | sudo tee /etc/yum.repos.d/rocm.repo > /dev/null
echo "name=ROCm$(REPO_RADEON_VERSION)" | sudo tee --append /etc/yum.repos.d/rocm.repo
echo "baseurl=https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/" | sudo tee --append /etc/yum.repos.d/rocm.repo
echo "enabled=1" | sudo tee --append /etc/yum.repos.d/rocm.repo
echo "gpgcheck=1" | sudo tee --append /etc/yum.repos.d/rocm.repo
echo "gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" | sudo tee --append /etc/yum.repos.d/rocm.repo
sudo dnf clean all
sudo dnf makecache
- task: Bash@3
displayName: 'Install base dnf packages'
inputs:
targetType: inline
script: |
sudo dnf config-manager --set-enabled powertools
# rpm fusion free repo for some dependencies
sudo dnf -y install https://download1.rpmfusion.org/free/el/rpmfusion-free-release-8.noarch.rpm
sudo dnf -y install ${{ join(' ', parameters.basePackages) }}
- task: Bash@3
displayName: 'Check gcc environment'
inputs:
targetType: inline
script: |
echo "=== Versions and sanity checks ==="
gcc --version
g++ --version
gcc -print-file-name=libstdc++.so
g++ -print-file-name=libstdc++.so
- task: Bash@3
displayName: 'Set python 3.11 as default'
inputs:
targetType: inline
script: |
sudo dnf -y module disable python36
sudo rm -f /usr/local/bin/python3.12 /usr/local/bin/python3.13 /usr/local/bin/python3.14
sudo alternatives --set python /usr/bin/python3.11
sudo alternatives --set python3 /usr/bin/python3.11
python3 --version
python3 -m pip install --upgrade pip setuptools wheel
- ${{ each pkg in parameters.aptPackages }}:
# note: llvm needs a ninja-build version newer than what dnf provides
- ${{ if eq(pkg, 'ninja-build') }}:
- task: Bash@3
displayName: 'Install ninja 1.11.1'
inputs:
targetType: inline
script: |
curl -LO https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip
sudo dnf -y install unzip
unzip ninja-linux.zip
sudo mv ninja /usr/local/bin/ninja
sudo chmod +x /usr/local/bin/ninja
echo "##vso[task.prependpath]/usr/local/bin"
- ${{ if ne(parameters.aptToDnfMap[pkg], '') }}:
- task: Bash@3
displayName: 'dnf install ${{ parameters.aptToDnfMap[pkg] }}'
inputs:
targetType: inline
script: |
sudo dnf -y install ${{ parameters.aptToDnfMap[pkg] }}
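
The aptToDnfMap above lets each component keep a single apt-style package list that is translated at install time. A minimal shell sketch of the same lookup, using a few entries copied from the map; the real template does this with Azure template expressions rather than bash.

declare -A apt_to_dnf=(            # entries copied from the map above
  [libnuma-dev]="numactl-devel"
  [libssl-dev]="openssl-devel"
  [ffmpeg]="ffmpeg ffmpeg-devel"
)
apt_packages=(libnuma-dev ffmpeg git)   # hypothetical component package list
for pkg in "${apt_packages[@]}"; do
  mapped=${apt_to_dnf[$pkg]:-}
  if [ -n "$mapped" ]; then
    echo sudo dnf -y install $mapped    # echo only; illustration, not a real install
  else
    echo "no dnf mapping for $pkg, skipped"
  fi
done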


@@ -9,56 +9,24 @@ parameters:
- name: registerROCmPackages
type: boolean
default: false
- name: packageManager
type: string
default: apt
steps:
- ${{ if eq(parameters.registerROCmPackages, true) }}:
- task: Bash@3
displayName: 'Register AMDGPU & ROCm repos'
inputs:
targetType: inline
script: |
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/$(REPO_RADEON_VERSION)/ubuntu jammy main" | sudo tee /etc/apt/sources.list.d/amdgpu.list
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION) jammy main" | sudo tee --append /etc/apt/sources.list.d/rocm.list
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt update
# firefox takes time to upgrade and is not needed for CI workloads; hold its version
- task: Bash@3
continueOnError: true
displayName: 'sudo apt-mark hold firefox'
inputs:
targetType: inline
script: sudo apt-mark hold firefox
- task: Bash@3
displayName: 'sudo apt-get update'
inputs:
targetType: inline
script: |
echo "deb http://archive.ubuntu.com/ubuntu/ jammy main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes update
- task: Bash@3
displayName: 'sudo apt-get upgrade'
inputs:
targetType: inline
script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes upgrade
- task: Bash@3
displayName: 'sudo apt-get fix'
inputs:
targetType: inline
script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-broken install
- ${{ if gt(length(parameters.aptPackages), 0) }}:
- task: Bash@3
displayName: 'sudo apt-get install ...'
inputs:
targetType: inline
script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install ${{ join(' ', parameters.aptPackages) }}
- ${{ if eq(parameters.packageManager, 'apt') }}:
- template: dependencies-apt.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
registerROCmPackages: ${{ parameters.registerROCmPackages }}
- ${{ if eq(parameters.packageManager, 'dnf') }}:
- template: dependencies-dnf.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
registerROCmPackages: ${{ parameters.registerROCmPackages }}
- ${{ if gt(length(parameters.pipModules), 0) }}:
- task: Bash@3
displayName: 'pip install ...'
inputs:
targetType: inline
script: pip install -v --force-reinstall ${{ join(' ', parameters.pipModules) }}
script: python3 -m pip install -v --force-reinstall ${{ join(' ', parameters.pipModules) }}
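
Switching the install step from a bare pip to python3 -m pip matters once alternatives has repointed python3: it guarantees the modules land in the interpreter the pipeline just selected rather than whatever pip happens to be first on PATH. A quick sanity check, assuming python3.11 is installed as in the dnf template above:

sudo alternatives --set python3 /usr/bin/python3.11
python3 --version           # Python 3.11.x
python3 -m pip --version    # pip ... (python 3.11) -- bound to the same interpreter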


@@ -13,6 +13,9 @@ parameters:
- name: dependencyList
type: object
default: []
- name: os
type: string
default: 'ubuntu2204'
- name: gpuTarget
type: string
default: ''
@@ -36,6 +39,10 @@ parameters:
- name: aggregatePipeline
type: boolean
default: false
# monorepo related parameters
- name: downstreamAggregateNames
type: string
default: ''
- name: componentVarList
type: object
@@ -103,7 +110,7 @@ parameters:
hipCUB:
pipelineId: $(HIPCUB_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: mainline
mainlineBranch: develop
hasGpuTarget: true
hipFFT:
pipelineId: $(HIPFFT_PIPELINE_ID)
@@ -123,7 +130,7 @@ parameters:
hipRAND:
pipelineId: $(HIPRAND_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: mainline
mainlineBranch: develop
hasGpuTarget: true
hipSOLVER:
pipelineId: $(HIPSOLVER_PIPELINE_ID)
@@ -258,7 +265,7 @@ parameters:
rocPRIM:
pipelineId: $(ROCPRIM_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: mainline
mainlineBranch: develop
hasGpuTarget: true
rocprofiler:
pipelineId: $(ROCPROFILER_PIPELINE_ID)
@@ -298,7 +305,7 @@ parameters:
rocRAND:
pipelineId: $(ROCRAND_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: mainline
mainlineBranch: develop
hasGpuTarget: true
rocr_debug_agent:
pipelineId: $(ROCR_DEBUG_AGENT_PIPELINE_ID)
@@ -323,7 +330,7 @@ parameters:
rocThrust:
pipelineId: $(ROCTHRUST_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: mainline
mainlineBranch: develop
hasGpuTarget: true
roctracer:
pipelineId: $(ROCTRACER_PIPELINE_ID)
@@ -361,7 +368,7 @@ steps:
pipelineId: ${{ parameters.componentVarList[split(dependency, ':')[0]].pipelineId }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.componentVarList[split(dependency, ':')[0]].hasGpuTarget }}:
fileFilter: "${{ split(dependency, ':')[1] }}*${{ parameters.gpuTarget }}"
fileFilter: "${{ split(dependency, ':')[1] }}*_${{ parameters.os }}_${{ parameters.gpuTarget }}"
# dependencySource = staging
${{ if eq(parameters.dependencySource, 'staging')}}:
branchName: ${{ parameters.componentVarList[split(dependency, ':')[0]].stagingBranch }}
@@ -384,6 +391,14 @@ steps:
${{ else }}:
branchName: ${{ parameters.componentVarList[split(dependency, ':')[0]].stagingBranch }}
# no colon (:) found in this item in the list
- ${{ elseif containsValue(split(parameters.downstreamAggregateNames, '+'), dependency) }}:
- template: local-artifact-download.yml
parameters:
${{ if parameters.componentVarList[dependency].hasGpuTarget }}:
gpuTarget: ${{ parameters.gpuTarget }}
preTargetFilter: ${{ dependency }}
os: ${{ parameters.os }}
buildType: current
- ${{ else }}:
- template: artifact-download.yml
parameters:
@@ -391,7 +406,9 @@ steps:
pipelineId: ${{ parameters.componentVarList[dependency].pipelineId }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.componentVarList[dependency].hasGpuTarget }}:
fileFilter: ${{ parameters.gpuTarget }}
fileFilter: ${{ parameters.os }}_${{ parameters.gpuTarget }}
${{ else }}:
fileFilter: ${{ parameters.os }}
# dependencySource = staging
${{ if eq(parameters.dependencySource, 'staging')}}:
branchName: ${{ parameters.componentVarList[dependency].stagingBranch }}
@@ -419,14 +436,16 @@ steps:
displayName: Symlink from rocm/llvm to rocm/lib/llvm
inputs:
targetType: inline
script: sudo ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
script: |
sudo mkdir -p $(Agent.BuildDirectory)/rocm/lib
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
- task: Bash@3
displayName: Symlink executables from rocm/llvm/bin to rocm/bin
inputs:
targetType: inline
script: |
for file in amdclang amdclang++ amdclang-cl amdclang-cpp amdflang amdlld aompcc mygpu mycpu offload-arch; do
sudo ln -s $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
done
# dlopen calls within a ctest or pytest sequence run into issues when the shared library symlink convention is not followed
# the convention is as follows:
@@ -471,8 +490,10 @@ steps:
targetType: inline
# the OS ignores duplicate entries if the ROCm lib folder shows up more than once
script: |
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
sudo cat /etc/ld.so.conf.d/rocm-ci.conf
sudo ldconfig -v
ldconfig -p
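
The convention the comment refers to is cut off by the diff, but it is presumably the standard ELF soname chain. A sketch with a hypothetical library:

# hypothetical library; the chain is linker name -> soname -> real file
#   libfoo.so -> libfoo.so.1 -> libfoo.so.1.2.3
ln -s libfoo.so.1.2.3 libfoo.so.1   # soname symlink, used by the runtime loader
ln -s libfoo.so.1     libfoo.so     # linker name, used at build time via -lfoo
sudo ldconfig                       # re-reads /etc/ld.so.conf.d, including the entries above
ldconfig -p | grep libfoo           # confirms the loader can resolve the soname

dlopen resolves libraries by soname through the ldconfig cache, which is why both the symlinks and the ld.so.conf.d entries registered above have to be in place before ctest or pytest runs.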


@@ -0,0 +1,53 @@
parameters:
- name: os
type: string
default: 'ubuntu2204'
- name: dependencyList
type: object
- name: pipelineIdList
type: object
default:
boost: 250
grpc: 72
gtest: 73
half560: 68
lapack: 69
steps:
- ${{ each dependency in parameters.dependencyList }}:
- task: DownloadPipelineArtifact@2
displayName: Download ${{ dependency }}
inputs:
project: ROCm-CI
buildType: specific
targetPath: $(Pipeline.Workspace)/d
definition: ${{ parameters.pipelineIdList[dependency] }}
itemPattern: '**/*${{ parameters.os }}*'
- task: ExtractFiles@1
displayName: Extract ${{ dependency }}
inputs:
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
destinationFolder: $(Agent.BuildDirectory)/vendor
cleanDestinationFolder: true
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: Clean up ${{ dependency }}
inputs:
SourceFolder: $(Pipeline.Workspace)/d
Contents: '**/*.tar.gz'
RemoveDotFiles: true
- task: Bash@3
displayName: List vendored files
inputs:
targetType: inline
script: ls -la1R $(Agent.BuildDirectory)/vendor
- task: Bash@3
displayName: Link vendored shared libraries
inputs:
targetType: inline
script: |
echo $(Agent.BuildDirectory)/vendor/lib | sudo tee -a /etc/ld.so.conf.d/vendor.conf
echo $(Agent.BuildDirectory)/vendor/lib64 | sudo tee -a /etc/ld.so.conf.d/vendor.conf
sudo cat /etc/ld.so.conf.d/vendor.conf
sudo ldconfig -v
ldconfig -p


@@ -2,6 +2,9 @@
# It can be overridden to download any artifact from any pipeline, given the appropriate build/pipeline IDs
parameters:
- name: os
type: string
default: 'ubuntu2204'
- name: gpuTarget
type: string
default: ''
@@ -29,25 +32,27 @@ parameters:
steps:
- task: DownloadPipelineArtifact@2
displayName: 'Download Pipeline Build'
displayName: Download ${{ parameters.preTargetFilter}}*${{ parameters.os }}_${{ parameters.gpuTarget}}*${{ parameters.postTargetFilter}}
inputs:
${{ if eq(parameters.buildType, 'specific') }}:
buildType: specific
buildVersionToDownload: specific
project: ROCm-CI
definition: ${{ parameters.definitionId }}
buildId: ${{ parameters.buildId }}
itemPattern: '**/*${{ parameters.preTargetFilter }}*${{ parameters.gpuTarget }}*${{ parameters.postTargetFilter }}*'
${{ if ne(parameters.definitionId, 0) }}:
definition: ${{ parameters.definitionId }}
${{ if ne(parameters.buildId, 0) }}:
buildId: ${{ parameters.buildId }}
itemPattern: '**/*${{ parameters.preTargetFilter }}*${{ parameters.os }}_${{ parameters.gpuTarget }}*${{ parameters.postTargetFilter }}*'
targetPath: $(Pipeline.Workspace)/d
- task: ExtractFiles@1
displayName: 'Extract Pipeline Build'
displayName: Extract ${{ parameters.preTargetFilter}}*${{ parameters.os }}_${{ parameters.gpuTarget}}*${{ parameters.postTargetFilter}}
inputs:
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
destinationFolder: '$(Agent.BuildDirectory)/rocm'
cleanDestinationFolder: false
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: 'Clean up Compressed Pipeline Build'
displayName: Clean up ${{ parameters.preTargetFilter}}*${{ parameters.os }}_${{ parameters.gpuTarget}}*${{ parameters.postTargetFilter}}
inputs:
SourceFolder: '$(Pipeline.Workspace)/d'
Contents: '/**/*.tar.gz'
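
The itemPattern must agree with the artifact-upload naming scheme for the filter to match anything. A small sketch checking a hypothetical artifact name against the pattern the template now builds:

name="roctracer_12345_20250626.1_almalinux8_gfx942_drop_1.tar.gz"   # hypothetical
preTargetFilter=roctracer; os=almalinux8; gpuTarget=gfx942
pattern="*${preTargetFilter}*${os}_${gpuTarget}*"
[[ $name == $pattern ]] && echo "match: would be downloaded"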


@@ -1,10 +1,19 @@
parameters:
- name: artifactName
- name: componentName
type: string
default: 'drop'
default: $(Build.DefinitionName)
- name: sparseCheckoutDir
type: string
default: ''
- name: gpuTarget
type: string
default: ''
- name: artifactName
type: string
default: drop
- name: os
type: string
default: 'ubuntu2204'
steps:
- task: Bash@3
@@ -25,8 +34,9 @@ steps:
IS_TAG_BUILD=$(jq 'has("release_repo")' resources.repositories)
IS_AOMP_BUILD=$(jq 'has("aomp_repo")' resources.repositories)
IS_MATHLIBS_BUILD=$(jq 'has("libraries_repo")' resources.repositories)
if [ "$IS_TAG_BUILD" = "true" ] || [ "$IS_AOMP_BUILD" = "true" ]; then
if [ "$IS_TAG_BUILD" = "true" ] || [ "$IS_AOMP_BUILD" = "true" ] || [ "$IS_MATHLIBS_BUILD" = "true" ]; then
exclude_keys=("pipelines_repo" "self") # Triggered by a file under ROCm/ROCm
else
exclude_keys=("pipelines_repo") # Triggered by a file under a component repo
@@ -45,6 +55,7 @@ steps:
buildId: "$(Build.BuildId)",
repoId: $entry.value.id,
repoName: $entry.value.name,
repoSparse: "${{ parameters.sparseCheckoutDir }}",
repoRef: $entry.value.ref,
repoUrl: $entry.value.url,
repoVersion: $entry.value.version
@@ -55,7 +66,7 @@ steps:
)
' resources.repositories)
manifest_json=$(Build.ArtifactStagingDirectory)/manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json
manifest_json=$(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json
dependencies=()
for manifest_file in $(Pipeline.Workspace)/d/**/manifest_*.json; do
@@ -81,6 +92,7 @@ steps:
"<tr><td>" + .buildNumber + "</td>" +
"<td><a href=\"https://dev.azure.com/ROCm-CI/ROCm-CI/_build/results?buildId=" + .buildId + "\">" + .buildId + "</a></td>" +
"<td><a href=\"" + .repoUrl + "\">" + .repoName + "</a></td>" +
"<td><a href=\"" + .repoUrl + "/tree/" + .repoRef + "/" + .repoSparse + "\">" + .repoSparse + "</a></td>" +
"<td><a href=\"" + .repoUrl + "/tree/" + .repoRef + "\">" + .repoRef + "</a></td>" +
"<td><a href=\"" + .repoUrl + "/commit/" + .repoVersion + "\">" + .repoVersion + "</a></td></tr>"
')
@@ -93,6 +105,7 @@ steps:
"<tr><td>" + .buildNumber + "</td>" +
"<td><a href=\"https://dev.azure.com/ROCm-CI/ROCm-CI/_build/results?buildId=" + .buildId + "\">" + .buildId + "</a></td>" +
"<td><a href=\"" + .repoUrl + "\">" + .repoName + "</a></td>" +
"<td><a href=\"" + .repoUrl + "/tree/" + .repoRef + "/" + .repoSparse + "\">" + .repoSparse + "</a></td>" +
"<td><a href=\"" + .repoUrl + "/tree/" + .repoRef + "\">" + .repoRef + "</a></td>" +
"<td><a href=\"" + .repoUrl + "/commit/" + .repoVersion + "\">" + .repoVersion + "</a></td></tr>"
')
@@ -107,7 +120,7 @@ steps:
inputs:
targetType: inline
script: |
manifest_html=$(Build.ArtifactStagingDirectory)/manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
manifest_html=$(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
cat <<EOF > $manifest_html
<html>
<h1>Manifest</h1>
@@ -117,6 +130,7 @@ steps:
<th>Build Number</th>
<th>Build ID</th>
<th>Repo Name</th>
<th>Repo Sparse</th>
<th>Repo Ref</th>
<th>Repo Version</th>
</tr>
@@ -128,6 +142,7 @@ steps:
<th>Build Number</th>
<th>Build ID</th>
<th>Repo Name</th>
<th>Repo Sparse</th>
<th>Repo Ref</th>
<th>Repo Version</th>
</tr>
@@ -148,7 +163,7 @@ steps:
continueOnError: true
inputs:
tabName: Manifest
reportDir: $(Build.ArtifactStagingDirectory)/manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
reportDir: $(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
- task: Bash@3
displayName: Save manifest artifact file name
condition: always()
@@ -157,5 +172,5 @@ steps:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: |
echo "manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html" >> pipelineArtifacts.txt
echo "manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json" >> pipelineArtifacts.txt
echo "manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html" >> pipelineArtifacts.txt
echo "manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json" >> pipelineArtifacts.txt


@@ -25,7 +25,7 @@ steps:
echo "Fetching CK build ID for commit $CK_COMMIT"
CK_CHECKS_URL="$GH_API/composable_kernel/commits/${CK_COMMIT}/check-runs"
CK_BUILD_ID=$(curl -s $CK_CHECKS_URL | \
jq '.check_runs[] | select(.name == "composable_kernel" and .app.slug == "azure-pipelines") | .details_url' | \
jq '.check_runs[] | select(.name == "composable_kernel" and .app.slug == "azure-pipelines" and .conclusion == "success") | .details_url' | \
tr -d '"' | grep -oP 'buildId=\K\d+')
# If none found, use latest successful CK build instead
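
The build id is pulled out of the details_url with grep's PCRE \K, which keeps the digits while discarding the matched prefix; the added conclusion check means only a successful azure-pipelines run is selected. The same extraction on a hypothetical URL:

echo 'https://dev.azure.com/ROCm-CI/ROCm-CI/_build/results?buildId=9876' \
  | grep -oP 'buildId=\K\d+'    # prints 9876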


@@ -3,10 +3,27 @@
# also display installed components and packages
steps:
- task: Bash@3
displayName: List apt packages
displayName: OS Version
inputs:
targetType: inline
script: apt list --installed
script: cat /etc/os-release
- task: Bash@3
displayName: List installed packages (apt, dnf, or yum)
inputs:
targetType: inline
script: |
if command -v apt >/dev/null 2>&1; then
echo "Listing installed packages with apt:"
apt list --installed
elif command -v dnf >/dev/null 2>&1; then
echo "Listing installed packages with dnf:"
dnf list installed
elif command -v yum >/dev/null 2>&1; then
echo "Listing installed packages with yum:"
yum list installed
else
echo "No supported package manager found (apt, dnf, yum)."
fi
- task: Bash@3
displayName: Print Python version
inputs:
@@ -16,7 +33,7 @@ steps:
displayName: List Python packages
inputs:
targetType: inline
script: pip list -v
script: python3 -m pip list -v
# The "Azure Pipelines" agents install CMake in multiple ways, including a standalone install into /usr/local/bin:
# https://github.com/actions/runner-images/blob/6d939a3ab352a54a021dd67b071577287b6f14a5/images/ubuntu/scripts/build/install-cmake.sh#L27
# This standalone CMake does not have a fixed version, and is not the same version as the one installed by the package manager

View File

@@ -2,21 +2,27 @@ parameters:
- name: componentName
type: string
default: ''
- name: os
type: string
default: ubuntu2204
- name: testDir
type: string
default: 'build'
default: build
- name: testExecutable
type: string
default: 'ctest'
default: ctest
- name: testParameters
type: string
default: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml'
default: --output-on-failure --force-new-ctest-process --output-junit test_output.xml
- name: extraTestParameters
type: string
default: ''
- name: testOutputFile
type: string
default: test_output.xml
- name: testOutputFormat
type: string
default: 'JUnit'
default: JUnit
values:
- JUnit
- NUnit
@@ -26,26 +32,28 @@ parameters:
- name: testPublishResults
type: boolean
default: true
- name: allowPartiallySucceededBuilds
- name: allowComponentTestFailure
type: object
default:
- amdsmi
- aomp
- HIPIFY
- MIVisionX
- rocm_smi_lib
- rocprofiler-sdk
- roctracer
# the following do not use this template but allow test failures, included for completeness
- aomp
- ROCgdb
steps:
# run the test; continue on failure so that test results
# and build artifacts are still published
- task: Bash@3
displayName: '${{ parameters.componentName }} Test'
continueOnError: ${{ containsValue(parameters.allowPartiallySucceededBuilds, parameters.componentName) }}
continueOnError: ${{ containsValue(parameters.allowComponentTestFailure, parameters.componentName) }}
inputs:
targetType: inline
script: ${{ parameters.testExecutable }} ${{ parameters.testParameters }}
script: |
${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
${{ parameters.testExecutable }} ${{ parameters.testParameters }} ${{ parameters.extraTestParameters }}
workingDirectory: ${{ parameters.testDir }}
- ${{ if parameters.testPublishResults }}:
- task: PublishTestResults@2


@@ -3,6 +3,8 @@
variables:
- name: RESOURCES_REPOSITORIES
value: $[ convertToJson(resources.repositories) ]
- name: CCACHE_DIR
value: $(Pipeline.Workspace)/ccache
- name: CI_ROOT_PATH
value: /.azuredevops
- name: CI_COMPONENT_PATH
@@ -30,320 +32,136 @@ variables:
- name: GFX90A_TEST_POOL
value: gfx90a_test_pool
- name: LATEST_RELEASE_VERSION
value: 6.4.0
value: 6.4.1
- name: REPO_RADEON_VERSION
value: 6.4
value: 6.4.1
- name: NEXT_RELEASE_VERSION
value: 6.5.0
value: 7.0.0
- name: LATEST_RELEASE_TAG
value: rocm-6.4.0
value: rocm-6.4.1
- name: DOCKER_SKIP_GFX
value: gfx90a
- name: AMDMIGRAPHX_GFX942_TEST_PIPELINE_ID
value: 197
- name: AMDMIGRAPHX_PIPELINE_ID
value: 113
- name: AMDMIGRAPHX_TAGGED_PIPELINE_ID
value: 60
- name: AMDSMI_PIPELINE_ID
value: 99
- name: AMDSMI_TAGGED_PIPELINE_ID
value: 33
- name: AOMP_EXTRAS_PIPELINE_ID
value: 111
- name: AOMP_EXTRAS_TAGGED_PIPELINE_ID
value: 75
- name: AOMP_PIPELINE_ID
value: 115
- name: AOMP_TAGGED_PIPELINE_ID
value: 76
- name: CCACHE_DIR
value: $(Pipeline.Workspace)/ccache
- name: CLR_PIPELINE_ID
value: 145
- name: CLR_TAGGED_PIPELINE_ID
value: 71
- name: COMPOSABLE_KERNEL_GFX942_TEST_PIPELINE_ID
value: 179
- name: COMPOSABLE_KERNEL_PIPELINE_ID
value: 86
- name: COMPOSABLE_KERNEL_TAGGED_PIPELINE_ID
value: 38
- name: FLANG_LEGACY_PIPELINE_ID
value: 77
- name: FLANG_LEGACY_TAGGED_PIPELINE_ID
value: 77
- name: HALF_PIPELINE_ID
value: 101
- name: HALF_TAGGED_PIPELINE_ID
value: 11
- name: HALF560_PIPELINE_ID
value: 68
- name: HALF560_BUILD_ID
value: 621
- name: HIP_PIPELINE_ID
value: 93
- name: HIP_TAGGED_PIPELINE_ID
value: 31
- name: HIP_TESTS_PIPELINE_ID
value: 233
- name: HIP_TESTS_TAGGED_PIPELINE_ID
value: 220
- name: HIPBLAS_COMMON_PIPELINE_ID
value: 223
- name: HIPBLAS_COMMON_TAGGED_PIPELINE_ID
value: 224
- name: HIPBLAS_GFX942_TEST_PIPELINE_ID
value: 202
- name: HIPBLAS_PIPELINE_ID
value: 87
- name: HIPBLAS_TAGGED_PIPELINE_ID
value: 44
- name: HIPBLASLT_GFX942_TEST_PIPELINE_ID
value: 187
- name: HIPBLASLT_PIPELINE_ID
value: 112
- name: HIPBLASLT_TAGGED_PIPELINE_ID
value: 45
- name: HIPCUB_GFX942_TEST_PIPELINE_ID
value: 186
- name: HIPCUB_PIPELINE_ID
value: 97
- name: HIPCUB_TAGGED_PIPELINE_ID
value: 46
- name: HIPFFT_GFX942_TEST_PIPELINE_ID
value: 198
value: 277
- name: HIPFFT_PIPELINE_ID
value: 121
- name: HIPFFT_TAGGED_PIPELINE_ID
value: 12
- name: HIPFORT_PIPELINE_ID
value: 102
- name: HIPFORT_TAGGED_PIPELINE_ID
value: 34
- name: HIPIFY_PIPELINE_ID
value: 92
- name: HIPIFY_TAGGED_PIPELINE_ID
value: 13
- name: HIPRAND_GFX942_TEST_PIPELINE_ID
value: 188
- name: HIPRAND_PIPELINE_ID
value: 90
- name: HIPRAND_TAGGED_PIPELINE_ID
value: 42
- name: HIPSOLVER_GFX942_TEST_PIPELINE_ID
value: 201
value: 275
- name: HIPSOLVER_PIPELINE_ID
value: 84
- name: HIPSOLVER_TAGGED_PIPELINE_ID
value: 52
- name: HIPSPARSE_GFX942_TEST_PIPELINE_ID
value: 195
- name: HIPSPARSE_PIPELINE_ID
value: 83
- name: HIPSPARSE_TAGGED_PIPELINE_ID
value: 14
- name: HIPSPARSELT_GFX942_TEST_PIPELINE_ID
value: 200
- name: HIPSPARSELT_PIPELINE_ID
value: 104
- name: HIPSPARSELT_TAGGED_PIPELINE_ID
value: 53
- name: HIPTENSOR_GFX942_TEST_PIPELINE_ID
value: 192
- name: HIPTENSOR_PIPELINE_ID
value: 105
- name: HIPTENSOR_TAGGED_PIPELINE_ID
value: 56
- name: LLVM_PROJECT_PIPELINE_ID
value: 2
- name: LLVM_PROJECT_TAGGED_PIPELINE_ID
value: 8
- name: MIOPEN_PIPELINE_ID
value: 108
- name: MIOPEN_TAGGED_PIPELINE_ID
value: 58
- name: MIVISIONX_PIPELINE_ID
value: 80
- name: MIVISIONX_TAGGED_PIPELINE_ID
value: 18
- name: OMNIPERF_PIPELINE_ID
value: 241
- name: OMNIPERF_TAGGED_PIPELINE_ID
value: 242
- name: OMNITRACE_PIPELINE_ID
value: 253
- name: OMNITRACE_TAGGED_PIPELINE_ID
value: 252
- name: RCCL_GFX942_TEST_PIPELINE_ID
value: 184
- name: RCCL_PIPELINE_ID
value: 107
- name: RCCL_TAGGED_PIPELINE_ID
value: 15
- name: RDC_PIPELINE_ID
value: 100
- name: RDC_TAGGED_PIPELINE_ID
value: 59
- name: ROCAL_PIPELINE_ID
value: 151
- name: ROCALUTION_GFX942_TEST_PIPELINE_ID
value: 196
- name: ROCALUTION_PIPELINE_ID
value: 89
- name: ROCALUTION_TAGGED_PIPELINE_ID
value: 16
- name: ROCBLAS_GFX942_TEST_PIPELINE_ID
value: 185
- name: ROCBLAS_PIPELINE_ID
value: 85
- name: ROCBLAS_TAGGED_PIPELINE_ID
value: 32
- name: ROCDBGAPI_PIPELINE_ID
value: 135
- name: ROCDBGAPI_TAGGED_PIPELINE_ID
value: 17
- name: ROCDECODE_PIPELINE_ID
value: 79
- name: ROCDECODE_TAGGED_PIPELINE_ID
value: 21
- name: ROCFFT_GFX942_TEST_PIPELINE_ID
value: 189
- name: ROCFFT_PIPELINE_ID
value: 120
- name: ROCFFT_TAGGED_PIPELINE_ID
value: 19
- name: ROCGDB_PIPELINE_ID
value: 134
- name: ROCGDB_TAGGED_PIPELINE_ID
value: 50
- name: ROCJPEG_PIPELINE_ID
value: 262
- name: ROCJPEG_TAGGED_PIPELINE_ID
value: 263
- name: ROCM_BANDWIDTH_TEST_PIPELINE_ID
value: 88
- name: ROCM_BANDWIDTH_TEST_TAGGED_PIPELINE_ID
value: 23
- name: ROCM_CMAKE_PIPELINE_ID
value: 6
- name: ROCM_CMAKE_TAGGED_PIPELINE_ID
value: 7
- name: ROCM_CORE_PIPELINE_ID
value: 103
- name: ROCM_CORE_TAGGED_PIPELINE_ID
value: 22
- name: ROCM_EXAMPLES_GFX942_TEST_PIPELINE_ID
value: 204
- name: ROCM_EXAMPLES_PIPELINE_ID
value: 216
- name: ROCM_EXAMPLES_TAGGED_PIPELINE_ID
value: 245
- name: ROCM_SMI_LIB_PIPELINE_ID
value: 96
- name: ROCM_SMI_LIB_TAGGED_PIPELINE_ID
value: 47
- name: ROCMINFO_PIPELINE_ID
value: 91
- name: ROCMINFO_TAGGED_PIPELINE_ID
value: 27
- name: ROCMLIR_PIPELINE_ID
value: 229
- name: ROCMLIR_TAGGED_PIPELINE_ID
value: 62
- name: ROCMVALIDATIONSUITE_PIPELINE_ID
value: 106
- name: ROCMVALIDATIONSUITE_TAGGED_PIPELINE_ID
value: 43
- name: ROCPRIM_GFX942_TEST_PIPELINE_ID
value: 180
- name: ROCPRIM_PIPELINE_ID
value: 82
- name: ROCPRIM_TAGGED_PIPELINE_ID
value: 20
- name: ROCPROFILER_GFX942_TEST_PIPELINE_ID
value: 273
- name: ROCPROFILER_COMPUTE_PIPELINE_ID
value: 257
- name: ROCPROFILER_COMPUTE_TAGGED_PIPELINE_ID
value: 258
- name: ROCPROFILER_REGISTER_PIPELINE_ID
value: 1
- name: ROCPROFILER_REGISTER_TAGGED_PIPELINE_ID
value: 25
- name: ROCPROFILER_SDK_PIPELINE_ID
value: 246
- name: ROCPROFILER_SDK_TAGGED_PIPELINE_ID
value: 234
- name: ROCPROFILER_SYSTEMS_PIPELINE_ID
value: 255
- name: ROCPROFILER_SYSTEMS_TAGGED_PIPELINE_ID
value: 254
- name: ROCPROFILER_PIPELINE_ID
value: 143
- name: ROCPROFILER_TAGGED_PIPELINE_ID
value: 28
- name: ROCPYDECODE_PIPELINE_ID
value: 239
- name: ROCPYDECODE_TAGGED_PIPELINE_ID
value: 232
- name: ROCR_DEBUG_AGENT_PIPELINE_ID
value: 136
- name: ROCR_DEBUG_AGENT_TAGGED_PIPELINE_ID
value: 29
- name: ROCR_RUNTIME_PIPELINE_ID
value: 10
- name: ROCR_RUNTIME_TAGGED_PIPELINE_ID
value: 24
- name: ROCRAND_GFX942_TEST_PIPELINE_ID
value: 183
- name: ROCRAND_PIPELINE_ID
value: 95
- name: ROCRAND_TAGGED_PIPELINE_ID
value: 41
- name: ROCSOLVER_GFX942_TEST_PIPELINE_ID
value: 274
- name: ROCSOLVER_PIPELINE_ID
value: 81
- name: ROCSOLVER_TAGGED_PIPELINE_ID
value: 55
- name: ROCSPARSE_GFX942_TEST_PIPELINE_ID
value: 191
- name: ROCSPARSE_PIPELINE_ID
value: 98
- name: ROCSPARSE_TAGGED_PIPELINE_ID
value: 67
- name: ROCT_THUNK_INTERFACE_PIPELINE_ID
value: 3
- name: ROCT_THUNK_INTERFACE_TAGGED_PIPELINE_ID
value: 9
- name: ROCTHRUST_GFX942_TEST_PIPELINE_ID
value: 194
- name: ROCTHRUST_PIPELINE_ID
value: 94
- name: ROCTHRUST_TAGGED_PIPELINE_ID
value: 26
- name: ROCTRACER_GFX942_TEST_PIPELINE_ID
value: 276
- name: ROCTRACER_PIPELINE_ID
value: 141
- name: ROCTRACER_TAGGED_PIPELINE_ID
value: 30
- name: ROCWMMA_GFX942_TEST_PIPELINE_ID
value: 193
- name: ROCWMMA_PIPELINE_ID
value: 109
- name: ROCWMMA_TAGGED_PIPELINE_ID
value: 57
- name: RPP_GFX942_TEST_PIPELINE_ID
value: 182
- name: RPP_PIPELINE_ID
value: 78
- name: RPP_TAGGED_PIPELINE_ID
value: 39
- name: TRANSFERBENCH_PIPELINE_ID
value: 265
- name: TRANSFERBENCH_TAGGED_PIPELINE_ID
value: 266
- name: BOOST_DEPENDENCY_PIPELINE_ID
value: 250

View File

@@ -1,3 +1,18 @@
Datacenter
GST
IET
LTO
MX
Microscaling
NANOO
ROCprof
affinitization
amdclang
benefitting
demangled
inlined
microscaling
roofline
AAC
ABI
ACE
@@ -6,7 +21,6 @@ ACS
AccVGPR
AccVGPRs
ALU
AllReduce
AMD
AMDGPU
AMDGPUs
@@ -14,7 +28,6 @@ AMDMIGraphX
AMI
AOCC
AOMP
AOT
AOTriton
APBDIS
APIC
@@ -34,7 +47,6 @@ Andrej
Arb
Autocast
BARs
BatchNorm
BLAS
BMC
BabelStream
@@ -82,13 +94,10 @@ ConnectX
CuPy
da
Dashboarding
Dataloading
DBRX
DDR
DF
DGEMM
DGL
DGLGraph
dGPU
dGPUs
DIMM
@@ -106,7 +115,6 @@ DataFrame
DataLoader
DataParallel
Debian
decompositions
DeepSeek
DeepSpeed
Dependabot
@@ -132,12 +140,10 @@ FX
Filesystem
FindDb
Flang
FlashAttention
FluxBenchmark
Fortran
Fuyu
GALB
GAT
GCC
GCD
GCDs
@@ -165,8 +171,6 @@ GPT
GPU
GPU's
GPUs
Graphbolt
GraphSage
GRBM
GenAI
GenZ
@@ -179,7 +183,6 @@ HIPCC
HIPExtension
HIPIFY
HIPification
hipification
HIPify
HPC
HPCG
@@ -194,7 +197,6 @@ Higgs
Hyperparameters
Huggingface
ICD
ICT
ICV
IDE
IDEs
@@ -229,7 +231,6 @@ KV
KVM
Karpathy's
KiB
Kineto
Keras
Khronos
LAPACK
@@ -242,7 +243,6 @@ LM
LSAN
LSan
LTS
LSTMs
LanguageCrossEntropy
LoRA
MEM
@@ -279,7 +279,6 @@ Miniconda
MirroredStrategy
Mixtral
MosaicML
Mpops
Multicore
Multithreaded
MyEnvironment
@@ -293,7 +292,6 @@ NIC
NICs
NLI
NLP
NN
NPKit
NPS
NSP
@@ -330,7 +328,6 @@ OpenMPI
OpenSSL
OpenVX
OpenXLA
Optim
Oversubscription
PagedAttention
Pallas
@@ -369,7 +366,6 @@ RDC's
RDMA
RDNA
README
Recomputation
RHEL
RMW
RNN
@@ -402,7 +398,6 @@ Ryzen
SALU
SBIOS
SCA
ScaledGEMM
SDK
SDMA
SDPA
@@ -443,8 +438,6 @@ TCI
TCIU
TCP
TCR
TensorRT
TensorFloat
TF
TFLOPS
TP
@@ -531,7 +524,6 @@ allocator
allocators
amdgpu
api
aten
atmi
atomics
autogenerated
@@ -702,7 +694,6 @@ installable
interop
interprocedural
intra
intrinsics
invariants
invocating
ipo
@@ -721,13 +712,11 @@ linearized
linter
linux
llvm
lm
localscratch
logits
lossy
macOS
matchers
megatron
microarchitecture
migraphx
migratable
@@ -799,7 +788,6 @@ quantile
quantizer
quasirandom
queueing
qwen
radeon
rccl
rdc
@@ -808,7 +796,6 @@ reStructuredText
redirections
refactorization
reformats
reinforcememt
repo
repos
representativeness
@@ -816,7 +803,6 @@ req
resampling
rescaling
reusability
RLHF
roadmap
roc
rocAL
@@ -854,7 +840,6 @@ roctracer
rst
runtime
runtimes
ResNet
sL
scalability
scalable
@@ -870,7 +855,6 @@ sm
smi
softmax
spack
spmm
src
stochastically
strided
@@ -879,7 +863,6 @@ subdirectory
subexpression
subfolder
subfolders
submatrix
submodule
submodules
subnet
@@ -904,7 +887,6 @@ torchvision
tqdm
tracebacks
txt
TopK
uarch
uncached
uncacheable
@@ -932,7 +914,6 @@ vectorize
vectorized
vectorizer
vectorizes
verl
virtualize
virtualized
vjxb

View File

@@ -47,9 +47,7 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
#### Changed
* HIP runtime uses device bitcode before SPIRV.
* The implementation of preventing `hipLaunchKernel` latency degradation with number of idle streams is reverted/disabled by default.
* Stop using `__AMDGCN_WAVEFRONT_SIZE` and `warpSize` as compile-time constants. The `warpSize` variable is no longer `constexpr`, in order to match the CUDA specification.
See more details of the `warpSize` change within the [ROCm upcoming changes](#rocm-upcoming-changes).
* The implementation of preventing `hipLaunchKernel` latency degradation with number of idle streams is reverted or disabled by default.
#### Optimized

144
README.md
View File

@@ -19,143 +19,17 @@ ROCm supports programming models, such as OpenMP and OpenCL, and includes all ne
source software compilers, debuggers, and libraries. ROCm is fully integrated into machine learning
(ML) frameworks, such as PyTorch and TensorFlow.
## Getting the ROCm Source Code
> [!IMPORTANT]
> A new open source build platform for ROCm is under development at
> https://github.com/ROCm/TheRock, featuring a unified CMake build with bundled
> dependencies, Windows support, and more.
>
> The instructions below describe the prior process for building from source
> which will be replaced once TheRock is mature enough.
AMD ROCm is built from open source software. It is, therefore, possible to modify the various components of ROCm by downloading the source code and rebuilding the components. The source code for ROCm components can be cloned from each of the GitHub repositories using git. For easy access to download the correct versions of each of these tools, the ROCm repository contains a repo manifest file called [default.xml](./default.xml). You can use this manifest file to download the source code for ROCm software.
## Getting and Building ROCm from Source
### Installing the repo tool
The repo tool from Google allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo tool:
```bash
mkdir -p ~/bin/
curl https://storage.googleapis.com/git-repo-downloads/repo > ~/bin/repo
chmod a+x ~/bin/repo
```
**Note:** The `~/bin/` folder is used as an example. You can install the repo tool into a different folder if you prefer.
### Installing git-lfs
Some ROCm projects use the Git Large File Storage (LFS) format that may require you to install git-lfs. Refer to [Git Large File Storage](https://github.com/git-lfs/git-lfs/blob/main/INSTALLING.md) for more information. For example, to install git-lfs for Ubuntu, use the following command:
```bash
sudo apt-get install git-lfs
```
### Downloading the ROCm source code
The following example shows how to use the repo tool to download the ROCm source code. If you choose a directory other than ~/bin/ to install the repo tool, you must use that chosen directory in the code as shown below:
```bash
mkdir -p ~/ROCm/
cd ~/ROCm/
export ROCM_VERSION=6.4.1
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.4.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
~/bin/repo sync
```
**Note:** Using this sample code will cause the repo tool to download the open source code associated with the specified ROCm release. Ensure that you have ssh-keys configured on your machine for your GitHub ID prior to the download as explained at [Connecting to GitHub with SSH](https://docs.github.com/en/authentication/connecting-to-github-with-ssh).
## Building the ROCm source code
Each ROCm component repository contains directions for building that component, such as the rocSPARSE documentation [Installation and Building for Linux](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/install/Linux_Install_Guide.html). Refer to the specific component documentation for instructions on building the repository.
Each release of the ROCm software supports specific hardware and software configurations. Refer to [System requirements (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) for the current supported hardware and OS.
## Build ROCm from source
The build will use as many processors as it can find to build in parallel. Some of the compiles can consume as much as 10 GB of RAM, so make sure you have plenty of swap space!
By default, the ROCm build compiles for all supported GPU architectures and takes approximately 500 CPU hours.
Build time is reduced significantly if you limit the GPU architectures to build against using the GPU_ARCHS environment variable, as shown below.
```bash
# --------------------------------------
# Step 1: Clone the source code
# --------------------------------------
mkdir -p ~/WORKSPACE/ # Or any other folder name
cd ~/WORKSPACE/
export ROCM_VERSION=6.4.1
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.4.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
~/bin/repo sync
# --------------------------------------
# Step 2: Prepare build environment
# --------------------------------------
# Option 1: Start a docker container
# Pulling required base docker images:
# Ubuntu22.04 built from ROCm/tools/rocm-build/docker/ubuntu22/Dockerfile
docker pull rocm/rocm-build-ubuntu-22.04:6.4
# Ubuntu24.04 built from ROCm/tools/rocm-build/docker/ubuntu24/Dockerfile
docker pull rocm/rocm-build-ubuntu-24.04:6.4
# Start docker container and mount the source code folder:
docker run -ti \
-e ROCM_VERSION=${ROCM_VERSION} \
-e CCACHE_DIR=$HOME/.ccache \
-e CCACHE_ENABLED=true \
-e DOCK_WORK_FOLD=/src \
-w /src \
-v $PWD:/src \
-v /etc/passwd:/etc/passwd \
-v /etc/shadow:/etc/shadow \
-v ${HOME}/.ccache:${HOME}/.ccache \
-u $(id -u):$(id -g) \
<replace_with_required_ubuntu_base_docker_image> bash
# Option 2: Install required packages into the host machine
# For ubuntu22.04 system
cd ROCm/tools/rocm-build/docker/ubuntu22
cp * /tmp && cd /tmp
bash install-prerequisites.sh
# For ubuntu24.04 system
cd ROCm/tools/rocm-build/docker/ubuntu24
cp * /tmp && cd /tmp
bash install-prerequisites.sh
# --------------------------------------
# Step 3: Run build command line
# --------------------------------------
# Select GPU targets before building:
# When GPU_ARCHS is not set, the default GPU targets supported by the ROCm release will be used.
# To build against a subset of GFX architectures, set the environment variable as shown below.
# MI300 targets: gfx940, gfx941, gfx942.
export GPU_ARCHS="gfx942" # Example
export GPU_ARCHS="gfx940;gfx941;gfx942" # Example
cd ~/WORKSPACE/
# Pick and run build commands in the docker container:
# Build rocm-dev packages
make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} rocm-dev
# Build all ROCm packages
make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} all
# List all ROCm components to find the required components
make -f ROCm/tools/rocm-build/ROCm.mk list_components
# Build a single ROCm package
make -f ROCm/tools/rocm-build/ROCm.mk T_rocblas
# Find built packages in ubuntu22.04:
out/ubuntu-22.04/22.04/deb/
# Find built packages in ubuntu24.04:
out/ubuntu-24.04/24.04/deb/
# Find built logs in ubuntu22.04:
out/ubuntu-22.04/22.04/logs/
# Find built logs in ubuntu24.04:
out/ubuntu-24.04/24.04/logs/
# Logs for failed components end with the .errors extension.
out/ubuntu-22.04/22.04/logs/rocblas.errors # Example
# Logs for components still building end with the .inprogress extension.
out/ubuntu-22.04/22.04/logs/rocblas.inprogress # Example
# Logs for passed components use the component name with no extension.
out/ubuntu-22.04/22.04/logs/rocblas # Example
```
Note: [Overview for ROCm.mk](tools/rocm-build/README.md)
Please use the [TheRock](https://github.com/ROCm/TheRock) build system to build ROCm from source.
## ROCm documentation

View File

@@ -462,8 +462,6 @@ See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/roc
* HIP runtime uses device bitcode before SPIRV.
* The implementation of preventing `hipLaunchKernel` latency degradation with number of idle streams is reverted/disabled by default.
* Stop using `__AMDGCN_WAVEFRONT_SIZE` and `warpSize` as compile-time constants. The `warpSize` variable is no longer `constexpr`, in order to match the CUDA specification.
See more details of the `warpSize` change within the [ROCm upcoming changes](#rocm-upcoming-changes).
#### Optimized
@@ -579,9 +577,9 @@ See [issue #4768](https://github.com/ROCm/ROCm/issues/4768) on GitHub.
`rocm-smi-lib` does not get uninstalled and remains orphaned on RHEL and SLES systems when:
* [Uninstalling ROCm using the AMDGPU installer](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/install/amdgpu-install.html#uninstalling-rocm) with `amdgpu-install --uninstall`
* [Uninstalling ROCm using the AMDGPU installer](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/amdgpu-install.html#uninstalling-rocm) with `amdgpu-install --uninstall`
* [Uninstalling via package manager](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/install/install-methods/package-manager/package-manager-rhel.html#uninstall-rocm-packages)
* [Uninstalling via package manager](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/install-methods/package-manager/package-manager-rhel.html#uninstall-rocm-packages)
with `dnf remove rocm-core` on RHEL or `zypper remove rocm-core` on SLES.
As a workaround, manually remove the `rocm-smi-lib` package using `sudo dnf remove rocm-smi-lib` or `sudo zypper remove rocm-smi-lib`.
@@ -656,4 +654,4 @@ There are a number of upcoming changes planned for HIP runtime API in an upcomin
that are not backward compatible with prior releases. Most of these changes increase
alignment between HIP and CUDA APIs or behavior. Some of the upcoming changes are to
clean up header files, remove namespace collision, and have a clear separation between
`hipRTC` and HIP runtime. For more information, see [HIP 7.0 Is Coming: What You Need to Know to Stay Ahead](https://rocm.blogs.amd.com/ecosystems-and-partners/transition-to-hip-7.0-blog/README.html).
`hipRTC` and HIP runtime.

View File

@@ -30,9 +30,6 @@ ROCm Version,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5,
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
:doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat]_,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,

View File

@@ -54,9 +54,7 @@ compatibility and system requirements.
FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
:doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 2.1, 2.0, 1.13"
:doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1"
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.31
:doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,85f95ae
:doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>`,N/A,2.4.0,N/A
:doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.31
`ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.17.3
,,,
THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
@@ -157,7 +155,7 @@ compatibility and system requirements.
.. [#mi300x] Oracle Linux and Azure Linux are supported only on AMD Instinct MI300X.
.. [#single-node] Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
.. [#mi300_620] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
.. [#kfd_support] Starting from ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart (assuming hardware support is available in both). For earlier ROCm releases, the compatibility is provided for +/- 2 releases. These are the compatibility combinations that are currently supported.
.. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
.. [#RDNA-OS] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.
@@ -237,9 +235,6 @@ Expand for full historical view of:
.. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
.. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
.. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
.. [#verl_compat] verl is only supported on ROCm 6.2.0.
.. [#stanford-megatron-lm_compat] Stanford Megatron-LM is only supported on ROCm 6.3.0.
.. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
.. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
.. [#kfd_support-past-60] Starting from ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart (assuming hardware support is available in both). For earlier ROCm releases, the compatibility is provided for +/- 2 releases. These are the compatibility combinations that are currently supported.
.. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
.. [#RDNA-OS-past-60] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.

View File

@@ -1,255 +0,0 @@
:orphan:
.. meta::
:description: Deep Graph Library (DGL) compatibility
:keywords: GPU, DGL compatibility
.. version-set:: rocm_version latest
********************************************************************************
DGL compatibility
********************************************************************************
Deep Graph Library `(DGL) <https://www.dgl.ai/>`_ is an easy-to-use, high-performance, and scalable
Python package for deep learning on graphs. DGL is framework agnostic, meaning
that if a deep graph model is a component in an end-to-end application, the rest of
the logic can be implemented in PyTorch.
* ROCm support for DGL is hosted in the `https://github.com/ROCm/dgl <https://github.com/ROCm/dgl>`_ repository.
* Due to independent compatibility considerations, this location differs from the `https://github.com/dmlc/dgl <https://github.com/dmlc/dgl>`_ upstream repository.
* Use the prebuilt :ref:`Docker images <dgl-docker-compat>` with DGL, PyTorch, and ROCm preinstalled.
* See the :doc:`ROCm DGL installation guide <rocm-install-on-linux:install/3rd-party/dgl-install>`
to install and get started.
Supported devices
================================================================================
- **Officially Supported**: TF32 with AMD Instinct MI300X (through hipBLASLt)
- **Partially Supported**: TF32 with AMD Instinct MI250X
.. _dgl-recommendations:
Use cases and recommendations
================================================================================
DGL can be used for graph learning and for building popular graph models such as
GAT, GCN, and GraphSage. These support a variety of use cases, including:
- Recommender systems
- Network Optimization and Analysis
- 1D (Temporal) and 2D (Image) Classification
- Drug Discovery
Multiple DGL use cases have been tested and verified.
A recommended example follows a drug discovery pipeline using the ``SE3Transformer``.
Refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_,
where you can search for DGL examples and best practices to optimize your training workflows on AMD GPUs.
Coverage includes:
- Single-GPU training/inference
- Multi-GPU training
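The following is a minimal single-GPU sketch, assuming a ROCm build of DGL and PyTorch is installed (for example, from the Docker images below). Note that on ROCm, AMD GPUs are addressed through PyTorch's ``cuda`` device string.

.. code-block:: python

   import torch
   import dgl

   # Build a small directed graph from source/destination node IDs.
   src = torch.tensor([0, 1, 2])
   dst = torch.tensor([1, 2, 3])
   g = dgl.graph((src, dst), num_nodes=4)

   # Attach a node feature matrix.
   g.ndata["feat"] = torch.randn(4, 8)

   # Move the graph (and its features) to the AMD GPU.
   if torch.cuda.is_available():
       g = g.to("cuda")
   print(g)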
.. _dgl-docker-compat:
Docker image compatibility
================================================================================
.. |docker-icon| raw:: html
<i class="fab fa-docker"></i>
AMD validates and publishes `DGL images <https://hub.docker.com/r/rocm/dgl>`_
with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_.
Click the |docker-icon| to view the image on Docker Hub.
.. list-table:: DGL Docker image components
:header-rows: 1
:class: docker-image-compatibility
* - Docker
- DGL
- PyTorch
- Ubuntu
- Python
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-8ce2c3bcfaa137ab94a75f9e2ea711894748980f57417739138402a542dd5564"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
- `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
- 24.04
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-cf1683283b8eeda867b690229c8091c5bbf1edb9f52e8fb3da437c49a612ebe4"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
- 24.04
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-4834f178c3614e2d09e89e32041db8984c456d45dfd20286e377ca8635686554"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
- `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
- 22.04
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/dgl/dgl-2.4_rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-88740a2c8ab4084b42b10c3c6ba984cab33dd3a044f479c6d7618e2b2cb05e69"><i class="fab fa-docker fa-lg"></i></a>
- `2.4.0 <https://github.com/dmlc/dgl/releases/tag/v2.4.0>`_
- `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
- 22.04
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
Key ROCm libraries for DGL
================================================================================
DGL on ROCm depends on specific libraries that affect its features and performance.
Using the DGL Docker container, or building DGL with the provided Dockerfile or a ROCm base image, is recommended.
If you prefer to build it yourself, ensure the following dependencies are installed:
.. list-table::
:header-rows: 1
* - ROCm library
- Version
- Purpose
* - `Composable Kernel <https://github.com/ROCm/composable_kernel>`_
- :version-ref:`"Composable Kernel" rocm_version`
- Enables faster execution of core operations like matrix multiplication
(GEMM), convolutions and transformations.
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
- :version-ref:`hipBLAS rocm_version`
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
matrix and vector operations.
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
- :version-ref:`hipBLASLt rocm_version`
- hipBLASLt is an extension of the hipBLAS library, providing additional
features like epilogues fused into the matrix multiplication kernel or
use of integer tensor cores.
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
- :version-ref:`hipCUB rocm_version`
- Provides a C++ template library for parallel algorithms for reduction,
scan, sort and select.
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
- :version-ref:`hipFFT rocm_version`
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
- :version-ref:`hipRAND rocm_version`
- Provides fast random number generation for GPUs.
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
- :version-ref:`hipSOLVER rocm_version`
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
singular value decompositions (SVD).
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
- :version-ref:`hipSPARSE rocm_version`
- Accelerates operations on sparse matrices, such as sparse matrix-vector
or matrix-matrix products.
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
- :version-ref:`hipSPARSELt rocm_version`
- Accelerates operations on sparse matrices, such as sparse matrix-vector
or matrix-matrix products.
* - `hipTensor <https://github.com/ROCm/hipTensor>`_
- :version-ref:`hipTensor rocm_version`
- Optimizes for high-performance tensor operations, such as contractions.
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
- :version-ref:`MIOpen rocm_version`
- Optimizes deep learning primitives such as convolutions, pooling,
normalization, and activation functions.
* - `MIGraphX <https://github.com/ROCm/AMDMIGraphX>`_
- :version-ref:`MIGraphX rocm_version`
- Adds graph-level optimizations, supports ONNX models and mixed precision,
and enables Ahead-of-Time (AOT) compilation.
* - `MIVisionX <https://github.com/ROCm/MIVisionX>`_
- :version-ref:`MIVisionX rocm_version`
- Optimizes acceleration for computer vision and AI workloads like
preprocessing, augmentation, and inferencing.
* - `rocAL <https://github.com/ROCm/rocAL>`_
- :version-ref:`rocAL rocm_version`
- Accelerates the data pipeline by offloading intensive preprocessing and
augmentation tasks. rocAL is part of MIVisionX.
* - `RCCL <https://github.com/ROCm/rccl>`_
- :version-ref:`RCCL rocm_version`
- Optimizes for multi-GPU communication for operations like AllReduce and
Broadcast.
* - `rocDecode <https://github.com/ROCm/rocDecode>`_
- :version-ref:`rocDecode rocm_version`
- Provides hardware-accelerated data decoding capabilities, particularly
for image, video, and other dataset formats.
* - `rocJPEG <https://github.com/ROCm/rocJPEG>`_
- :version-ref:`rocJPEG rocm_version`
- Provides hardware-accelerated JPEG image decoding and encoding.
* - `RPP <https://github.com/ROCm/RPP>`_
- :version-ref:`RPP rocm_version`
- Speeds up data augmentation, transformation, and other preprocessing steps.
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
- :version-ref:`rocThrust rocm_version`
- Provides a C++ template library for parallel algorithms like sorting,
reduction, and scanning.
* - `rocWMMA <https://github.com/ROCm/rocWMMA>`_
- :version-ref:`rocWMMA rocm_version`
- Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
multiplication (GEMM) and accumulation operations with mixed precision
support.
Supported features
================================================================================
Many functions and methods available in upstream DGL are also supported in DGL on ROCm.
Instead of listing them all, support is grouped into the following categories to provide a general overview.
* DGL Base
* DGL Backend
* DGL Data
* DGL Dataloading
* DGL DGLGraph
* DGL Function
* DGL Ops
* DGL Sampling
* DGL Transforms
* DGL Utils
* DGL Distributed
* DGL Geometry
* DGL Mpops
* DGL NN
* DGL Optim
* DGL Sparse
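For instance, the ``dgl.function`` and Mpops (message-passing) categories above cover calls like the following minimal sketch, which assumes a graph ``g`` with a ``feat`` node feature as constructed earlier:

.. code-block:: python

   import torch
   import dgl
   import dgl.function as fn

   g = dgl.graph((torch.tensor([0, 1, 2]), torch.tensor([1, 2, 3])), num_nodes=4)
   g.ndata["feat"] = torch.randn(4, 8)

   # One round of message passing: copy source features to edges ("m"),
   # then sum the incoming messages into a new node field ("h").
   g.update_all(fn.copy_u("feat", "m"), fn.sum("m", "h"))
   print(g.ndata["h"].shape)  # torch.Size([4, 8])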
Unsupported features
================================================================================
* Graphbolt
* Partial TF32 Support (MI250x only)
* Kineto/ROCTracer integration
Unsupported functions
================================================================================
* ``more_nnz``
* ``format``
* ``multiprocess_sparse_adam_state_dict``
* ``record_stream_ndarray``
* ``half_spmm``
* ``segment_mm``
* ``gather_mm_idx_b``
* ``pgexplainer``
* ``sample_labors_prob``
* ``sample_labors_noprob``

View File

@@ -53,7 +53,7 @@ Use cases and recommendations
* The `nanoGPT in JAX <https://rocm.blogs.amd.com/artificial-intelligence/nanoGPT-JAX/README.html>`_
blog explores the implementation and training of a Generative Pre-trained
Transformer (GPT) model in JAX, inspired by Andrej Karpathy's PyTorch-based
nanoGPT. Comparing how essential GPT components, such as self-attention
mechanisms and optimizers, are realized in PyTorch and JAX also highlights
JAX's unique features.
@@ -160,14 +160,12 @@ associated inventories are tested for `ROCm 6.3.2 <https://repo.radeon.com/rocm/
- Ubuntu 22.04
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
.. _key_rocm_libraries:
Key ROCm libraries for JAX
================================================================================
The following ROCm libraries represent potential targets that could be utilized
by JAX on ROCm for various computational tasks. The actual libraries used will
depend on the specific implementation and operations performed.
JAX functionality on ROCm is determined by its underlying library
dependencies. These ROCm components affect the capabilities, performance, and
feature set available to developers.
.. list-table::
:header-rows: 1
@@ -175,140 +173,347 @@ depend on the specific implementation and operations performed.
* - ROCm library
- Version
- Purpose
- Used in
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
- :version-ref:`hipBLAS rocm_version`
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
matrix and vector operations.
- Matrix multiplication in ``jax.numpy.matmul``, ``jax.lax.dot``, and
``jax.lax.dot_general``; operations like ``jax.numpy.dot``, which
involve vector and matrix computations; and batched matrix
multiplications such as ``jax.numpy.einsum`` with
matrix-multiplication patterns.
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
- :version-ref:`hipBLASLt rocm_version`
- hipBLASLt is an extension of hipBLAS, providing additional
features like epilogues fused into the matrix multiplication kernel or
use of integer tensor cores.
- Matrix multiplication in ``jax.numpy.matmul`` or ``jax.lax.dot``, and
the XLA (Accelerated Linear Algebra) use hipBLASLt for optimized matrix
operations, mixed-precision support, and hardware-specific
optimizations.
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
- :version-ref:`hipCUB rocm_version`
- Provides a C++ template library for parallel algorithms for reduction,
scan, sort and select.
- Reduction functions (``jax.numpy.sum``, ``jax.numpy.mean``,
``jax.numpy.prod``, ``jax.numpy.max`` and ``jax.numpy.min``), prefix sum
(``jax.numpy.cumsum``, ``jax.numpy.cumprod``) and sorting
(``jax.numpy.sort``, ``jax.numpy.argsort``).
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
- :version-ref:`hipFFT rocm_version`
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
- Used in functions like ``jax.numpy.fft``.
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
- :version-ref:`hipRAND rocm_version`
- Provides fast random number generation for GPUs.
- The ``jax.random.uniform``, ``jax.random.normal``,
``jax.random.randint`` and ``jax.random.split``.
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
- :version-ref:`hipSOLVER rocm_version`
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
singular value decompositions (SVD).
- Solving linear systems (``jax.numpy.linalg.solve``), matrix
factorizations, SVD (``jax.numpy.linalg.svd``) and eigenvalue problems
(``jax.numpy.linalg.eig``).
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
- :version-ref:`hipSPARSE rocm_version`
- Accelerates operations on sparse matrices, such as sparse matrix-vector
or matrix-matrix products.
- Sparse matrix multiplication (``jax.numpy.matmul``), sparse
matrix-vector and matrix-matrix products
(``jax.experimental.sparse.dot``), sparse linear system solvers and
sparse data handling.
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
- :version-ref:`hipSPARSELt rocm_version`
- Accelerates operations on sparse matrices, such as sparse matrix-vector
or matrix-matrix products.
- Sparse matrix multiplication (``jax.numpy.matmul``), sparse
matrix-vector and matrix-matrix products
(``jax.experimental.sparse.dot``) and sparse linear system solvers.
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
- :version-ref:`MIOpen rocm_version`
- Optimized for deep learning primitives such as convolutions, pooling,
normalization, and activation functions.
- Speeds up convolutional neural networks (CNNs), recurrent neural
networks (RNNs), and other layers. Used in operations like
``jax.nn.conv``, ``jax.nn.relu``, and ``jax.nn.batch_norm``.
* - `RCCL <https://github.com/ROCm/rccl>`_
- :version-ref:`RCCL rocm_version`
- Optimized for multi-GPU communication for operations like all-reduce,
broadcast, and scatter.
- Distributes computations across multiple GPUs with ``pmap`` and
``jax.distributed``. XLA automatically uses RCCL when executing
operations across multiple GPUs on AMD hardware.
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
- :version-ref:`rocThrust rocm_version`
- Provides a C++ template library for parallel algorithms like sorting,
reduction, and scanning.
- Reduction operations like ``jax.numpy.sum``, ``jax.pmap`` for
distributed training (which involves parallel reductions), and
operations like ``jax.numpy.cumsum`` can use rocThrust.
.. note::
This table shows ROCm libraries that could potentially be utilized by JAX. Not
all libraries may be used in every configuration, and the actual library usage
will depend on the specific operations and implementation details.
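As an illustrative sketch only (the dispatch decisions above belong to XLA, not to user code), the following ordinary JAX program exercises the kinds of operations listed in the table: random number generation, a GEMM, a reduction, and an FFT.

.. code-block:: python

   import jax
   import jax.numpy as jnp

   key = jax.random.PRNGKey(0)            # RNG (hipRAND territory on ROCm)
   x = jax.random.normal(key, (512, 512))

   @jax.jit
   def step(x):
       y = jnp.matmul(x, x.T)             # GEMM (hipBLAS/hipBLASLt)
       s = jnp.sum(y)                     # reduction (hipCUB/rocThrust)
       f = jnp.fft.fft(x[0])              # FFT (hipFFT)
       return s, f

   s, f = step(x)
   print(s, f.shape)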
Supported data types and modules
Supported features
===============================================================================
The following tables list the supported public JAX API data types and modules.
Supported data types
--------------------------------------------------------------------------------
ROCm supports all the JAX data types of `jax.dtypes <https://docs.jax.dev/en/latest/jax.dtypes.html>`_
module, `jax.numpy.dtype <https://docs.jax.dev/en/latest/_autosummary/jax.numpy.dtype.html>`_
and `default_dtype <https://docs.jax.dev/en/latest/default_dtypes.html>`_.
The ROCm supported data types in JAX are collected in the following table.
The following table maps the public JAX API modules to their supported
ROCm and JAX versions.
.. list-table::
:header-rows: 1
* - Data type
* - Module
- Description
- As of JAX
- As of ROCm
* - ``jax.numpy``
- Implements the NumPy API, using the primitives in ``jax.lax``.
- 0.1.56
- 5.0.0
* - ``jax.scipy``
- Provides GPU-accelerated and differentiable implementations of many
functions from the SciPy library, leveraging JAX's transformations
(e.g., ``grad``, ``jit``, ``vmap``).
- 0.1.56
- 5.0.0
* - ``jax.lax``
- A library of primitive operations that underpins libraries such as
``jax.numpy``. Transformation rules, such as Jacobian-vector product
(JVP) and batching rules, are typically defined as transformations on
``jax.lax`` primitives.
- 0.1.57
- 5.0.0
* - ``jax.random``
- Provides a number of routines for deterministic generation of sequences
of pseudorandom numbers.
- 0.1.58
- 5.0.0
* - ``jax.sharding``
- Lets you define how arrays are partitioned and distributed across
multiple devices.
- 0.3.20
- 5.1.0
* - ``jax.distributed``
- Enables the scaling of computations across multiple devices on a single
machine or across multiple machines.
- 0.1.74
- 5.0.0
* - ``jax.image``
- Contains image manipulation functions like resize, scale and translation.
- 0.1.57
- 5.0.0
* - ``jax.nn``
- Contains common functions for neural network libraries.
- 0.1.56
- 5.0.0
* - ``jax.ops``
- Computes the minimum, maximum, sum or product within segments of an
array.
- 0.1.57
- 5.0.0
* - ``jax.stages``
- Contains interfaces to stages of the compiled execution process.
- 0.3.4
- 5.0.0
* - ``jax.extend``
- Provides access to JAX internal machinery. The ``jax.extend`` module
defines a library view of some of JAX's internal components.
- 0.4.15
- 5.5.0
* - ``jax.example_libraries``
- Serves as a collection of example code and libraries that demonstrate
various capabilities of JAX.
- 0.1.74
- 5.0.0
* - ``jax.experimental``
- Namespace for experimental features and APIs that are in development or
are not yet fully stable for production use.
- 0.1.56
- 5.0.0
* - ``jax.lib``
- Set of internal tools and types for bridging between JAX's Python
frontend and its XLA backend.
- 0.4.6
- 5.3.0
* - ``jax_triton``
- Library that integrates the Triton deep learning compiler with JAX.
- jax_triton 0.2.0
- 6.2.4
jax.scipy module
-------------------------------------------------------------------------------
A SciPy-like API for scientific computing.
.. list-table::
:header-rows: 1
* - Module
- As of JAX
- As of ROCm
* - ``jax.scipy.cluster``
- 0.3.11
- 5.1.0
* - ``jax.scipy.fft``
- 0.1.71
- 5.0.0
* - ``jax.scipy.integrate``
- 0.4.15
- 5.5.0
* - ``jax.scipy.interpolate``
- 0.1.76
- 5.0.0
* - ``jax.scipy.linalg``
- 0.1.56
- 5.0.0
* - ``jax.scipy.ndimage``
- 0.1.56
- 5.0.0
* - ``jax.scipy.optimize``
- 0.1.57
- 5.0.0
* - ``jax.scipy.signal``
- 0.1.56
- 5.0.0
* - ``jax.scipy.spatial.transform``
- 0.4.12
- 5.4.0
* - ``jax.scipy.sparse.linalg``
- 0.1.56
- 5.0.0
* - ``jax.scipy.special``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats``
- 0.1.56
- 5.0.0
jax.scipy.stats module
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. list-table::
:header-rows: 1
* - Module
- As of JAX
- As of ROCm
* - ``jax.scipy.stats.bernoulli``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.beta``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.betabinom``
- 0.1.61
- 5.0.0
* - ``jax.scipy.stats.binom``
- 0.4.14
- 5.4.0
* - ``jax.scipy.stats.cauchy``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.chi2``
- 0.1.61
- 5.0.0
* - ``jax.scipy.stats.dirichlet``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.expon``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.gamma``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.gennorm``
- 0.3.15
- 5.2.0
* - ``jax.scipy.stats.geom``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.laplace``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.logistic``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.multinomial``
- 0.3.18
- 5.1.0
* - ``jax.scipy.stats.multivariate_normal``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.nbinom``
- 0.1.72
- 5.0.0
* - ``jax.scipy.stats.norm``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.pareto``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.poisson``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.t``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.truncnorm``
- 0.4.0
- 5.3.0
* - ``jax.scipy.stats.uniform``
- 0.1.56
- 5.0.0
* - ``jax.scipy.stats.vonmises``
- 0.4.2
- 5.3.0
* - ``jax.scipy.stats.wrapcauchy``
- 0.4.20
- 5.6.0
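The distributions above follow their SciPy counterparts; for example, a short sketch evaluating the standard normal log-density on the GPU:

.. code-block:: python

   import jax.numpy as jnp
   from jax.scipy import stats

   x = jnp.linspace(-3.0, 3.0, 7)
   # Log-density of a standard normal distribution, computed on the GPU.
   print(stats.norm.logpdf(x, loc=0.0, scale=1.0))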
jax.extend module
-------------------------------------------------------------------------------
Modules for JAX extensions.
.. list-table::
:header-rows: 1
* - Module
- As of JAX
- As of ROCm
* - ``jax.extend.ffi``
- 0.4.30
- 6.0.0
* - ``jax.extend.linear_util``
- 0.4.17
- 5.6.0
* - ``jax.extend.mlir``
- 0.4.26
- 5.6.0
* - ``jax.extend.random``
- 0.4.15
- 5.5.0
Unsupported JAX features
===============================================================================
The following GPU-accelerated JAX features are not supported by ROCm for
the listed supported JAX versions.
.. list-table::
:header-rows: 1
* - Feature
- Description
* - ``bfloat16``
- 16-bit bfloat (brain floating point).
* - Mixed Precision with TF32
- Mixed precision with TF32 is used for matrix multiplications,
convolutions, and other linear algebra operations, particularly in
deep learning workloads like CNNs and transformers.
* - ``bool``
- Boolean.
* - XLA int4 support
- 4-bit integer (int4) precision in the XLA compiler.
* - ``complex128``
- 128-bit complex.
* - ``complex64``
- 64-bit complex.
* - ``float16``
- 16-bit (half precision) floating-point.
* - ``float32``
- 32-bit (single precision) floating-point.
* - ``float64``
- 64-bit (double precision) floating-point.
* - ``half``
- 16-bit (half precision) floating-point.
* - ``int16``
- Signed 16-bit integer.
* - ``int32``
- Signed 32-bit integer.
* - ``int64``
- Signed 64-bit integer.
* - ``int8``
- Signed 8-bit integer.
* - ``uint16``
- Unsigned 16-bit (word) integer.
* - ``uint32``
- Unsigned 32-bit (dword) integer.
* - ``uint64``
- Unsigned 64-bit (qword) integer.
* - ``uint8``
- Unsigned 8-bit (byte) integer.
.. note::
JAX data type support is affected by the :ref:`key_rocm_libraries` and is
collected on the :doc:`ROCm data types and precision support <rocm:reference/precision-support>`
page.
Supported modules
--------------------------------------------------------------------------------
For a complete and up-to-date list of JAX public modules (for example, ``jax.numpy``,
``jax.scipy``, ``jax.lax``), their descriptions, and usage, please refer directly to the
`official JAX API documentation <https://jax.readthedocs.io/en/latest/jax.html>`_.
.. note::
Since version 0.1.56, JAX has full support for ROCm, and the
:ref:`Known issues and important notes <jax_comp_known_issues>` section
contains details about limitations specific to the ROCm backend. The list of
JAX API modules is maintained by the JAX project and is subject to change.
Refer to the official JAX documentation for the most up-to-date information.
* - MOSAIC (GPU)
- Mosaic is a library of kernel-building abstractions for JAX's Pallas system.

View File

@@ -372,15 +372,24 @@ feature set available to developers.
involve matrix products, such as ``torch.matmul``, ``torch.bmm``, and
more.
Supported modules and data types
Supported features
================================================================================
The following section outlines the supported data types, modules, and domain libraries available in PyTorch on ROCm.
This section maps GPU-accelerated PyTorch features to their supported ROCm and
PyTorch versions.
Supported data types
torch
--------------------------------------------------------------------------------
The tensor data type is specified using the ``dtype`` attribute or argument.
`torch <https://pytorch.org/docs/stable/index.html>`_ is the central module of
PyTorch, providing data structures for multi-dimensional tensors and
implementing mathematical operations on them. It also includes utilities for
efficient serialization of tensors and arbitrary data types and other tools.
Tensor data types
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The tensor data type is specified using the ``dtype`` attribute or argument.
PyTorch supports many data types for different use cases.
The following table lists `torch.Tensor <https://pytorch.org/docs/stable/tensors.html>`_
@@ -391,154 +400,539 @@ single data types:
* - Data type
- Description
- As of PyTorch
- As of ROCm
* - ``torch.float8_e4m3fn``
- 8-bit floating point, e4m3
- 2.3
- 5.5
* - ``torch.float8_e5m2``
- 8-bit floating point, e5m2
- 2.3
- 5.5
* - ``torch.float16`` or ``torch.half``
- 16-bit floating point
- 0.1.6
- 2.0
* - ``torch.bfloat16``
- 16-bit floating point
- 1.6
- 2.6
* - ``torch.float32`` or ``torch.float``
- 32-bit floating point
- 0.1.12_2
- 2.0
* - ``torch.float64`` or ``torch.double``
- 64-bit floating point
- 0.1.12_2
- 2.0
* - ``torch.complex32`` or ``torch.chalf``
- 32-bit complex numbers
- PyTorch provides native support for 32-bit complex numbers
- 1.6
- 2.0
* - ``torch.complex64`` or ``torch.cfloat``
- 64-bit complex numbers
- PyTorch provides native support for 64-bit complex numbers
- 1.6
- 2.0
* - ``torch.complex128`` or ``torch.cdouble``
- 128-bit complex numbers
- PyTorch provides native support for 128-bit complex numbers
- 1.6
- 2.0
* - ``torch.uint8``
- 8-bit integer (unsigned)
- 0.1.12_2
- 2.0
* - ``torch.uint16``
- 16-bit integer (unsigned);
Not natively supported in ROCm
- 16-bit integer (unsigned)
- 2.3
- Not natively supported
* - ``torch.uint32``
- 32-bit integer (unsigned);
Not natively supported in ROCm
- 32-bit integer (unsigned)
- 2.3
- Not natively supported
* - ``torch.uint64``
- 64-bit integer (unsigned);
Not natively supported in ROCm
- 64-bit integer (unsigned)
- 2.3
- Not natively supported
* - ``torch.int8``
- 8-bit integer (signed)
- 1.12
- 5.0
* - ``torch.int16`` or ``torch.short``
- 16-bit integer (signed)
- 0.1.12_2
- 2.0
* - ``torch.int32`` or ``torch.int``
- 32-bit integer (signed)
- 0.1.12_2
- 2.0
* - ``torch.int64`` or ``torch.long``
- 64-bit integer (signed)
- 0.1.12_2
- 2.0
* - ``torch.bool``
- Boolean
- 1.2
- 2.0
* - ``torch.quint8``
- Quantized 8-bit integer (unsigned)
- 1.8
- 5.0
* - ``torch.qint8``
- Quantized 8-bit integer (signed)
- 1.8
- 5.0
* - ``torch.qint32``
- Quantized 32-bit integer (signed)
- 1.8
- 5.0
* - ``torch.quint4x2``
- Quantized 4-bit integer (unsigned)
- 1.8
- 5.0
.. note::
Unsigned types, except ``uint8``, have limited support in eager mode. They
Unsigned types except ``uint8`` have limited support in eager mode. They
primarily exist to assist usage with ``torch.compile``.
See :doc:`ROCm precision support <rocm:reference/precision-support>` for the
native hardware support of data types.
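A minimal sketch of selecting tensor data types from the table above; note that on ROCm builds of PyTorch, AMD GPUs are selected through the ``cuda`` device string.

.. code-block:: python

   import torch

   # On ROCm, "cuda" refers to the AMD GPU backend.
   dev = "cuda" if torch.cuda.is_available() else "cpu"

   a = torch.randn(4, 4, dtype=torch.float16, device=dev)
   b = torch.randn(4, 4, dtype=torch.bfloat16, device=dev)
   c = torch.arange(16, dtype=torch.int32, device=dev).reshape(4, 4)
   print(a.dtype, b.dtype, c.dtype)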
Supported modules
--------------------------------------------------------------------------------
torch.cuda
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
For a complete and up-to-date list of PyTorch core modules (for example, ``torch``,
``torch.nn``, ``torch.cuda``, ``torch.backends.cuda`` and
``torch.backends.cudnn``), their descriptions, and usage, please refer directly
to the `official PyTorch documentation <https://pytorch.org/docs/stable/index.html>`_.
Core PyTorch functionality on ROCm includes tensor operations, neural network
layers, automatic differentiation, distributed training, mixed-precision
training, compilation features, and domain-specific libraries for audio, vision,
text processing, and more.
Supported domain libraries
--------------------------------------------------------------------------------
PyTorch offers specialized `domain libraries <https://pytorch.org/domains/>`_ with
GPU acceleration that build on its core features to support specific application
areas. The table below lists the PyTorch domain libraries that are compatible
with ROCm.
``torch.cuda`` in PyTorch is a module that provides utilities and functions for
managing and utilizing AMD and NVIDIA GPUs. It enables GPU-accelerated
computations, memory management, and efficient execution of tensor operations,
leveraging ROCm and CUDA as the underlying frameworks.
.. list-table::
:header-rows: 1
* - Library
* - Feature
- Description
- As of PyTorch
- As of ROCm
* - Device management
- Utilities for managing and interacting with GPUs.
- 0.4.0
- 3.8
* - Tensor operations on GPU
- Performs tensor operations such as addition and matrix multiplications on
the GPU.
- 0.4.0
- 3.8
* - Streams and events
- Streams allow overlapping computation and communication for optimized
performance. Events enable synchronization.
- 1.6.0
- 3.8
* - Memory management
- Functions to manage and inspect memory usage like
``torch.cuda.memory_allocated()``, ``torch.cuda.max_memory_allocated()``,
``torch.cuda.memory_reserved()`` and ``torch.cuda.empty_cache()``.
- 0.3.0
- 1.9.2
* - Running process lists of memory management
- Returns a human-readable printout of the running processes and their GPU
memory use for a given device with functions like
``torch.cuda.memory_stats()`` and ``torch.cuda.memory_summary()``.
- 1.8.0
- 4.0
* - Communication collectives
- Set of APIs that enable efficient communication between multiple GPUs,
allowing for distributed computing and data parallelism.
- 1.9.0
- 5.0
* - ``torch.cuda.CUDAGraph``
- Graphs capture sequences of GPU operations to minimize kernel launch
overhead and improve performance.
- 1.10.0
- 5.3
* - TunableOp
- A mechanism that allows certain operations to be more flexible and
optimized for performance. It enables automatic tuning of kernel
configurations and other settings to achieve the best possible
performance based on the specific hardware (GPU) and workload.
- 2.0
- 5.4
* - NVIDIA Tools Extension (NVTX)
- Integration with NVTX for profiling and debugging GPU performance using
NVIDIA's Nsight tools.
- 1.8.0
- ❌
* - Lazy loading NVRTC
- Delays JIT compilation with NVRTC until the code is explicitly needed.
- 1.13.0
- ❌
* - Jiterator (beta)
- Jiterator enables just-in-time compilation and execution of
element-wise GPU kernels defined at runtime.
- 1.13.0
- 5.2
* - `torchaudio <https://docs.pytorch.org/audio/stable/index.html>`_
- Audio and signal processing library for PyTorch. Provides utilities for
audio I/O, signal and data processing functions, datasets, model
implementations, and application components for audio and speech
processing tasks.
.. Need to validate and extend.
**Note:** To ensure GPU-acceleration with ``torchaudio.transforms``,
you need to explicitly move audio data (waveform tensor) to GPU using
``.to('cuda')``.
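The memory-management utilities listed above can be exercised directly; a minimal sketch, assuming a GPU is visible:

.. code-block:: python

   import torch

   if torch.cuda.is_available():
       x = torch.randn(1024, 1024, device="cuda")
       # Inspect allocator state.
       print(torch.cuda.memory_allocated())   # bytes currently allocated
       print(torch.cuda.memory_reserved())    # bytes held by the caching allocator
       del x
       torch.cuda.empty_cache()               # release cached blocks
       print(torch.cuda.memory_summary())     # human-readable report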
torch.backends.cuda
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* - `torchtune <https://docs.pytorch.org/torchtune/stable/index.html>`_
- PyTorch-native library designed for fine-tuning large language models
(LLMs). Supports the full fine-tuning workflow and offers
compatibility with popular production inference systems.
``torch.backends.cuda`` is a PyTorch module that provides configuration options
and flags to control the behavior of ROCm or CUDA operations. It is part of the
PyTorch backend configuration system, which allows users to fine-tune how
PyTorch interacts with the ROCm or CUDA environment.
**Note:** Only the official upstream release exists.
.. list-table::
:header-rows: 1
* - `torchvision <https://docs.pytorch.org/vision/stable/index.html>`_
- Computer vision library that is part of the PyTorch project. Provides
popular datasets, model architectures, and common image transformations
for computer vision applications.
* - Feature
- Description
- As of PyTorch
- As of ROCm
* - ``cufft_plan_cache``
- Manages caching of GPU FFT plans to optimize repeated FFT computations.
- 1.7.0
- 5.0
* - ``matmul.allow_tf32``
- Enables or disables the use of TensorFloat-32 (TF32) precision for
faster matrix multiplications on GPUs with Tensor Cores.
- 1.10.0
- ❌
* - ``matmul.allow_fp16_reduced_precision_reduction``
- Reduced precision reductions (e.g., with fp16 accumulation type) are
allowed with fp16 GEMMs.
- 2.0
- ❌
* - ``matmul.allow_bf16_reduced_precision_reduction``
- Reduced precision reductions are allowed with bf16 GEMMs.
- 2.0
- ❌
* - ``enable_cudnn_sdp``
- Globally enables cuDNN SDPA's kernels within SDPA.
- 2.0
- ❌
* - ``enable_flash_sdp``
- Globally enables or disables FlashAttention for SDPA.
- 2.1
- ❌
* - ``enable_mem_efficient_sdp``
- Globally enables or disables Memory-Efficient Attention for SDPA.
- 2.1
- ❌
* - ``enable_math_sdp``
- Globally enables or disables the PyTorch C++ implementation within SDPA.
- 2.1
- ❌
torch.backends.cudnn
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Supported ``torch.backends.cudnn`` options include:
.. list-table::
:header-rows: 1
* - Option
- Description
- As of PyTorch
- As of ROCm
* - ``allow_tf32``
- TensorFloat-32 tensor cores may be used in cuDNN convolutions on NVIDIA
Ampere or newer GPUs.
- 1.12.0
- ❌
* - ``deterministic``
- A bool that, if True, causes cuDNN to only use deterministic
convolution algorithms.
- 1.12.0
- 6.0
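A short sketch of these options (assuming a GPU-enabled PyTorch build); on
ROCm, ``torch.backends.cudnn`` settings are honored by MIOpen where supported.

.. code-block:: python

   import torch

   # Restrict convolutions to deterministic algorithms for reproducibility
   torch.backends.cudnn.deterministic = True
   torch.backends.cudnn.benchmark = False  # disable convolution autotuning

   print(torch.backends.cudnn.allow_tf32)  # TF32 flag (not supported on ROCm, per the table)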
Automatic mixed precision: torch.amp
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
PyTorch automates the process of using both 16-bit (half-precision, float16) and
32-bit (single-precision, float32) floating-point types in model training and
inference.
.. list-table::
:header-rows: 1
* - Feature
- Description
- As of PyTorch
- As of ROCm
* - Autocasting
- Autocast instances serve as context managers or decorators that allow
regions of your script to run in mixed precision.
- 1.9
- 2.5
* - Gradient scaling
- To prevent underflow, “gradient scaling” multiplies the network's
loss by a scale factor and invokes a backward pass on the scaled
loss. The same factor then scales gradients flowing backward through
the network. In other words, gradient values have a larger magnitude so
that they don't flush to zero.
- 1.9
- 2.5
* - CUDA op-specific behavior
- These ops always go through autocasting whether they are invoked as part
of a ``torch.nn.Module``, as a function, or as a ``torch.Tensor`` method. If
functions are exposed in multiple namespaces, they go through
autocasting regardless of the namespace.
- 1.9
- 2.5
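A minimal training-step sketch combining autocasting and gradient scaling
(assuming a GPU-enabled build of a recent PyTorch release):

.. code-block:: python

   import torch

   model = torch.nn.Linear(1024, 1024).to("cuda")
   optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
   scaler = torch.amp.GradScaler("cuda")  # gradient scaling for float16 training

   inputs = torch.randn(64, 1024, device="cuda")
   target = torch.randn(64, 1024, device="cuda")

   optimizer.zero_grad()
   # Run the forward pass in mixed precision
   with torch.autocast(device_type="cuda", dtype=torch.float16):
       loss = torch.nn.functional.mse_loss(model(inputs), target)

   scaler.scale(loss).backward()  # backward pass on the scaled loss
   scaler.step(optimizer)         # unscales gradients, then runs the optimizer step
   scaler.update()                # adjusts the scale factor for the next iteration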
Distributed library features
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The PyTorch distributed library includes a collective of parallelism modules, a
communications layer, and infrastructure for launching and debugging large
training jobs. See :ref:`rocm-for-ai-pytorch-distributed` for more information.
The Distributed Library feature in PyTorch provides tools and APIs for building
and running distributed machine learning workflows. It allows training models
across multiple processes, GPUs, or nodes in a cluster, enabling efficient use
of computational resources and scalability for large-scale tasks.
.. list-table::
:header-rows: 1
* - Feature
- Description
- As of PyTorch
- As of ROCm
* - TensorPipe
- A point-to-point communication library integrated into
PyTorch for distributed training. It handles tensor data transfers
efficiently between different processes or devices, including those on
separate machines.
- 1.8
- 5.4
* - Gloo
- Designed for multi-machine and multi-GPU setups, enabling
efficient communication and synchronization between processes. Gloo is
one of the default backends for PyTorch's Distributed Data Parallel
(DDP) and RPC frameworks, alongside other backends like NCCL and MPI.
- 1.0
- 2.0
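For illustration, here is a minimal collective-communication sketch. It assumes
the script is launched with ``torchrun --nproc_per_node=N`` and one GPU per
process; on ROCm, the ``nccl`` backend name is served by RCCL.

.. code-block:: python

   import torch
   import torch.distributed as dist

   dist.init_process_group(backend="nccl")  # RCCL on ROCm
   rank = dist.get_rank()
   torch.cuda.set_device(rank)

   # Each rank contributes a tensor; all_reduce sums them in place across ranks
   t = torch.full((4,), float(rank), device="cuda")
   dist.all_reduce(t, op=dist.ReduceOp.SUM)
   print(f"rank {rank}: {t}")

   dist.destroy_process_group()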
torch.compiler
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. list-table::
:header-rows: 1
* - Feature
- Description
- As of PyTorch
- As of ROCm
* - ``torch.compiler`` (AOT Autograd)
- Autograd captures not only the user-level code, but also backpropagation,
which results in capturing the backwards pass “ahead-of-time”. This
enables acceleration of both forwards and backwards pass using
``TorchInductor``.
- 2.0
- 5.3
* - ``torch.compiler`` (TorchInductor)
- The default ``torch.compile`` deep learning compiler that generates fast
code for multiple accelerators and backends. You need to use a backend
compiler to make speedups through ``torch.compile`` possible. For AMD,
NVIDIA, and Intel GPUs, it leverages OpenAI Triton as the key building block.
- 2.0
- 5.3
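A minimal sketch of ``torch.compile`` with the default TorchInductor backend
(assuming a GPU-enabled PyTorch 2.x build):

.. code-block:: python

   import torch

   def fn(x):
       return torch.sin(x) ** 2 + torch.cos(x) ** 2

   # TorchInductor is the default backend; on AMD GPUs it emits Triton kernels
   compiled_fn = torch.compile(fn)

   x = torch.randn(1 << 20, device="cuda")
   print(compiled_fn(x).mean())  # ~1.0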
torchaudio
--------------------------------------------------------------------------------
The `torchaudio <https://pytorch.org/audio/stable/index.html>`_ library provides
utilities for processing audio data in PyTorch, such as audio loading,
transformations, and feature extraction.
To ensure GPU-acceleration with ``torchaudio.transforms``, you need to
explicitly move audio data (waveform tensor) to GPU using ``.to('cuda')``.
The following ``torchaudio`` features are GPU-accelerated.
.. list-table::
:header-rows: 1
* - Feature
- Description
- As of torchaudio version
- As of ROCm
* - ``torchaudio.transforms.Spectrogram``
- Generates a spectrogram of an input waveform using STFT.
- 0.6.0
- 4.5
* - ``torchaudio.transforms.MelSpectrogram``
- Generates the mel-scale spectrogram of raw audio signals.
- 0.9.0
- 4.5
* - ``torchaudio.transforms.MFCC``
- Extracts MFCC features.
- 0.9.0
- 4.5
* - ``torchaudio.transforms.Resample``
- Resamples a signal from one frequency to another.
- 0.9.0
- 4.5
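For example, the following sketch applies a ``torchaudio`` transform on the
GPU. ``speech.wav`` is a hypothetical input file; note that the waveform must
be moved to the GPU explicitly.

.. code-block:: python

   import torchaudio

   waveform, sample_rate = torchaudio.load("speech.wav")  # hypothetical input file
   waveform = waveform.to("cuda")  # move the waveform tensor to the GPU

   # Transforms are torch.nn.Module objects, so they can be moved to the GPU too
   mel = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate).to("cuda")
   features = mel(waveform)  # computed on the GPU
   print(features.shape)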
torchvision
--------------------------------------------------------------------------------
The `torchvision <https://pytorch.org/vision/stable/index.html>`_ library
provides datasets, model architectures, and common image transformations for
computer vision.
The following ``torchvision`` features are GPU-accelerated.
.. list-table::
:header-rows: 1
* - Feature
- Description
- As of torchvision version
- As of ROCm
* - ``torchvision.transforms.functional``
- Provides GPU-compatible transformations for image preprocessing like
resize, normalize, rotate and crop.
- 0.2.0
- 4.0
* - ``torchvision.ops``
- GPU-accelerated operations for object detection and segmentation tasks,
such as ``torchvision.ops.roi_align``, ``torchvision.ops.nms`` and
``torchvision.ops.box_convert``.
- 0.6.0
- 3.3
* - ``torchvision.models`` with ``.to('cuda')``
- ``torchvision`` provides several pre-trained models (ResNet, Faster
R-CNN, Mask R-CNN, ...) that can run on CUDA for faster inference and
training.
- 0.1.6
- 2.x
* - ``torchvision.io``
- Enables video decoding and frame extraction using GPU acceleration with
NVIDIA's NVDEC and nvJPEG (rocJPEG on AMD GPUs).
- 0.4.0
- 6.3
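A short sketch exercising the GPU-accelerated ``torchvision`` features above
(assuming a GPU-enabled build; the ResNet-50 weights are downloaded on first
use):

.. code-block:: python

   import torch
   from torchvision import models, ops

   # Pre-trained model inference on the GPU
   model = models.resnet50(weights="IMAGENET1K_V2").to("cuda").eval()
   images = torch.randn(8, 3, 224, 224, device="cuda")
   with torch.no_grad():
       logits = model(images)

   # GPU-accelerated non-maximum suppression from torchvision.ops
   boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                         [1.0, 1.0, 11.0, 11.0]], device="cuda")
   scores = torch.tensor([0.9, 0.8], device="cuda")
   keep = ops.nms(boxes, scores, iou_threshold=0.5)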
torchtext
--------------------------------------------------------------------------------
The `torchtext <https://pytorch.org/text/stable/index.html>`_ library provides
utilities for processing and working with text data in PyTorch, including
tokenization, vocabulary management, and text embeddings. torchtext supports
preprocessing pipelines and integration with PyTorch models, simplifying the
implementation of natural language processing (NLP) tasks.
To leverage GPU acceleration in torchtext, you need to move tensors
explicitly to the GPU using ``.to('cuda')``.
* torchtext does not implement its own kernels. ROCm support is enabled by
linking against ROCm libraries.
* Only the official release exists.
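A minimal sketch follows. The vocabulary here is a hypothetical stand-in
(real pipelines would build one with ``torchtext.vocab``); the point is that
preprocessing runs on the CPU and the resulting tensors are moved to the GPU
explicitly.

.. code-block:: python

   import torch
   from torchtext.data.utils import get_tokenizer

   tokenizer = get_tokenizer("basic_english")
   tokens = tokenizer("ROCm accelerates PyTorch workloads")

   # Hypothetical vocabulary lookup for illustration only
   vocab = {tok: i for i, tok in enumerate(sorted(set(tokens)))}
   indices = torch.tensor([vocab[t] for t in tokens]).to("cuda")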
torchtune
--------------------------------------------------------------------------------
The `torchtune <https://pytorch.org/torchtune/stable/index.html>`_ library is a
PyTorch-native library for authoring, fine-tuning, and experimenting with LLMs.
* Usage: enables developers to fine-tune LLMs with ROCm-accelerated PyTorch.
* Only the official release exists.
torchserve
--------------------------------------------------------------------------------
`torchserve <https://pytorch.org/serve/>`_ is a performant, flexible, and
easy-to-use tool for serving PyTorch models in production.
* torchserve does not implement its own kernels. ROCm support is enabled by
linking against ROCm libraries.
* torchserve is no longer actively maintained; the last official release
shipped with PyTorch 2.4.
torchrec
--------------------------------------------------------------------------------
`torchrec <https://pytorch.org/torchrec/>`_ is a PyTorch domain library for
common sparsity and parallelism primitives needed for large-scale recommender
systems.
* torchrec does not implement its own kernels. ROCm support is enabled by
linking against ROCm libraries.
* Only the official release exists.
Unsupported PyTorch features
================================================================================
The following GPU-accelerated PyTorch features are not supported by ROCm as of
the listed PyTorch versions.
.. list-table::
:widths: 30, 60, 10
:header-rows: 1
* - Feature
- Description
- As of PyTorch
* - APEX batch norm
- Use APEX batch norm instead of PyTorch batch norm.
- 1.6.0
* - ``torch.backends.cuda`` / ``matmul.allow_tf32``
- A bool that controls whether TensorFloat-32 tensor cores may be used in
matrix multiplications.
- 1.7
* - ``torch.cuda`` / NVIDIA Tools Extension (NVTX)
- Integration with NVTX for profiling and debugging GPU performance using
NVIDIA's Nsight tools.
- 1.7.0
* - ``torch.cuda`` / Lazy loading NVRTC
- Delays JIT compilation with NVRTC until the code is explicitly needed.
- 1.8.0
* - ``torch-tensorrt``
- Integrates the TensorRT library for optimizing and deploying PyTorch
models. ROCm does not have an equivalent library for TensorRT.
- 1.9.0
* - ``torch.backends`` / ``cudnn.allow_tf32``
- TensorFloat-32 tensor cores may be used in cuDNN convolutions.
- 1.10.0
* - ``torch.backends.cuda`` / ``matmul.allow_fp16_reduced_precision_reduction``
- Reduced precision reductions with fp16 accumulation type are
allowed with fp16 GEMMs.
- 2.0
* - ``torch.backends.cuda`` / ``matmul.allow_bf16_reduced_precision_reduction``
- Reduced precision reductions are allowed with bf16 GEMMs.
- 2.0
* - ``torch.nn.functional`` / ``scaled_dot_product_attention``
- Flash attention backend for SDPA to accelerate attention computation in
transformer-based models.
- 2.0
* - ``torch.backends.cuda`` / ``enable_cudnn_sdp``
- Globally enables cuDNN SDPA's kernels within SDPA.
- 2.0
* - ``torch.backends.cuda`` / ``enable_flash_sdp``
- Globally enables or disables FlashAttention for SDPA.
- 2.1
* - ``torch.backends.cuda`` / ``enable_mem_efficient_sdp``
- Globally enables or disables Memory-Efficient Attention for SDPA.
- 2.1
* - ``torch.backends.cuda`` / ``enable_math_sdp``
- Globally enables or disables the PyTorch C++ implementation within SDPA.
- 2.1
* - Dynamic parallelism
- PyTorch itself does not directly expose dynamic parallelism as a core
feature. Dynamic parallelism allows GPU threads to launch additional
threads, which can be accessed using custom operations via the
``torch.utils.cpp_extension`` module.
- Not a core feature
* - Unified memory support in PyTorch
- Unified memory is not directly exposed in PyTorch's core API, but it can
be utilized effectively through custom CUDA extensions or advanced
workflows.
- Not a core feature


@@ -1,100 +0,0 @@
:orphan:
.. meta::
:description: Stanford Megatron-LM compatibility
:keywords: Stanford, Megatron-LM, compatibility
.. version-set:: rocm_version latest
********************************************************************************
Stanford Megatron-LM compatibility
********************************************************************************
Stanford Megatron-LM is a large-scale language model training framework based on NVIDIA's `Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_. It is
designed to train massive transformer-based language models efficiently using model and data parallelism.
* ROCm support for Stanford Megatron-LM is hosted in the official `https://github.com/ROCm/Stanford-Megatron-LM <https://github.com/ROCm/Stanford-Megatron-LM>`_ repository.
* Due to independent compatibility considerations, this location differs from the `https://github.com/stanford-futuredata/Megatron-LM <https://github.com/stanford-futuredata/Megatron-LM>`_ upstream repository.
* Use the prebuilt :ref:`Docker image <megatron-lm-docker-compat>` with ROCm, PyTorch, and Megatron-LM preinstalled.
* See the :doc:`ROCm Stanford Megatron-LM installation guide <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>` to install and get started.
.. note::
Stanford Megatron-LM is supported on ROCm 6.3.0.
Supported devices
================================================================================
- **Officially supported**: AMD Instinct MI300X
- **Partially supported** (functionality or performance limitations): AMD Instinct MI250X and MI210
Supported models and features
================================================================================
This section details the models and features supported by Stanford Megatron-LM on ROCm.
Models:
* BERT
* GPT
* T5
* ICT
Features:
* Distributed Pre-training
* Activation Checkpointing and Recomputation
* Distributed Optimizer
* Mixture-of-Experts
.. _megatron-lm-recommendations:
Use cases and recommendations
================================================================================
See the `Efficient MoE training on AMD ROCm: How-to use Megablocks on AMD GPUs <https://rocm.blogs.amd.com/artificial-intelligence/megablocks/README.html>`_ blog post
to learn how to leverage the ROCm platform for pre-training with the Stanford Megatron-LM framework, including pre-processing datasets on AMD GPUs.
Coverage includes:
* Single-GPU pre-training
* Multi-GPU pre-training
.. _megatron-lm-docker-compat:
Docker image compatibility
================================================================================
.. |docker-icon| raw:: html
<i class="fab fa-docker"></i>
AMD validates and publishes `Stanford Megatron-LM images <https://hub.docker.com/r/rocm/megatron-lm>`_
with ROCm and PyTorch backends on Docker Hub. The following Docker image tags and associated
inventories represent the latest Stanford Megatron-LM version from the official Docker Hub.
The Docker images have been validated for `ROCm 6.3.0 <https://repo.radeon.com/rocm/apt/6.3/>`_.
Click |docker-icon| to view the image on Docker Hub.
.. list-table::
:header-rows: 1
:class: docker-image-compatibility
* - Docker image
- Stanford Megatron-LM
- PyTorch
- Ubuntu
- Python
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/stanford-megatron-lm/stanford-megatron-lm85f95ae_rocm6.3.0_ubuntu24.04_py3.12_pytorch2.4.0/images/sha256-070556f078be10888a1421a2cb4f48c29f28b02bfeddae02588d1f7fc02a96a6"><i class="fab fa-docker fa-lg"></i></a>
- `85f95ae <https://github.com/stanford-futuredata/Megatron-LM/commit/85f95aef3b648075fe6f291c86714fdcbd9cd1f5>`_
- `2.4.0 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
- 24.04
- `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_


@@ -1,85 +0,0 @@
:orphan:
.. meta::
:description: verl compatibility
:keywords: GPU, verl compatibility
.. version-set:: rocm_version latest
*******************************************************************************
verl compatibility
*******************************************************************************
Volcano Engine Reinforcement Learning for LLMs (verl) is a reinforcement learning framework designed for large language models (LLMs).
verl offers a scalable, open-source fine-tuning solution optimized for AMD Instinct GPUs with full ROCm support.
* See the `verl documentation <https://verl.readthedocs.io/en/latest/>`_ for more information about verl.
* The official verl GitHub repository is `https://github.com/volcengine/verl <https://github.com/volcengine/verl>`_.
* Use the AMD-validated :ref:`Docker images <verl-docker-compat>` with ROCm and verl preinstalled.
* See the :doc:`ROCm verl installation guide <rocm-install-on-linux:install/3rd-party/verl-install>` to get started.
.. note::
verl is supported on ROCm 6.2.0.
.. _verl-recommendations:
Use cases and recommendations
================================================================================
The benefits of verl in large-scale reinforcement learning from human feedback (RLHF) are discussed in the `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`_ blog.
.. _verl-docker-compat:
Docker image compatibility
================================================================================
.. |docker-icon| raw:: html
<i class="fab fa-docker"></i>
AMD validates and publishes ready-made `ROCm verl Docker images <https://hub.docker.com/r/rocm/verl>`_
with ROCm backends on Docker Hub. The following Docker image tags and associated inventories represent the latest verl version from the official Docker Hub. The Docker images have been validated for `ROCm 6.2.0 <https://repo.radeon.com/rocm/apt/6.2/>`_.
.. list-table::
:header-rows: 1
* - Docker image
- verl
- Linux
- PyTorch
- Python
- vLLM
* - .. raw:: html
<a href="https://hub.docker.com/layers/rocm/verl/verl-0.3.0.post0_rocm6.2_vllm0.6.3/images/sha256-cbe423803fd7850448b22444176bee06f4dcf22cd3c94c27732752d3a39b04b2"><i class="fab fa-docker fa-lg"></i> rocm/verl</a>
- `0.3.0post0 <https://github.com/volcengine/verl/releases/tag/v0.3.0.post0>`_
- Ubuntu 20.04
- `2.5.0 <https://download.pytorch.org/whl/cu118/torch-2.5.0%2Bcu118-cp39-cp39-linux_x86_64.whl#sha256=1ee24b267418c37b297529ede875b961e382c1c365482f4142af2398b92ed127>`_
- `3.9.19 <https://www.python.org/downloads/release/python-3919/>`_
- `0.6.4 <https://github.com/vllm-project/vllm/releases/tag/v0.6.4>`_
Supported features
===============================================================================
The following table shows verl and ROCm support for GPU-accelerated modules.
.. list-table::
:header-rows: 1
* - Module
- Description
- verl version
- ROCm version
* - ``FSDP``
- Training engine
- 0.3.0.post0
- 6.2
* - ``vllm``
- Inference engine
- 0.3.0.post0
- 6.2


@@ -8,7 +8,7 @@ MI300 and MI200 series performance counters and metrics
This document lists and describes the hardware performance counters and derived metrics available
for the AMD Instinct™ MI300 and MI200 GPUs. You can also access this information using the
:doc:`ROCprofiler-SDK <rocprofiler-sdk:how-to/using-rocprofv3>` or the
:doc:`ROCProfiler tool <rocprofiler:rocprofv1>`.
MI300 and MI200 series performance counters
===============================================================


@@ -34,90 +34,86 @@ project = "ROCm Documentation"
project_path = os.path.abspath(".").replace("\\", "/")
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
version = "6.4.1"
release = "6.4.1"
version = "7.0 Alpha"
release = "7.0 Alpha"
setting_all_article_info = True
all_article_info_os = ["linux", "windows"]
all_article_info_author = ""
# pages with specific settings
article_pages = [
{"file": "about/release-notes", "os": ["linux"], "date": "2025-05-07"},
{"file": "release/changelog", "os": ["linux"],},
{"file": "compatibility/compatibility-matrix", "os": ["linux"]},
{"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
{"file": "compatibility/ml-compatibility/tensorflow-compatibility", "os": ["linux"]},
{"file": "compatibility/ml-compatibility/jax-compatibility", "os": ["linux"]},
{"file": "how-to/deep-learning-rocm", "os": ["linux"]},
{"file": "preview/index", "os": ["linux"],},
{"file": "preview/release", "os": ["linux"],},
{"file": "preview/install/index", "os": ["linux"],},
{"file": "preview/install/instinct-driver", "os": ["linux"],},
{"file": "preview/install/rocm", "os": ["linux"],},
{"file": "preview/benchmark-docker/index", "os": ["linux"],},
{"file": "preview/benchmark-docker/training", "os": ["linux"],},
{"file": "preview/benchmark-docker/pre-training-megatron-lm-llama-3-8b", "os": ["linux"],},
{"file": "preview/benchmark-docker/pre-training-torchtitan-llama-3-70b", "os": ["linux"],},
{"file": "preview/benchmark-docker/fine-tuning-lora-llama-2-70b", "os": ["linux"],},
{"file": "preview/benchmark-docker/inference", "os": ["linux"],},
{"file": "preview/benchmark-docker/inference-vllm-llama-3.1-405b-fp4", "os": ["linux"],},
{"file": "preview/benchmark-docker/inference-sglang-deepseek-r1-fp4", "os": ["linux"],},
{"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.7.3-20250325", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.3-20250415", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference-optimization/model-quantization", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference-optimization/optimizing-with-composable-kernel", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference-optimization/optimizing-triton-kernel", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference-optimization/profiling-and-debugging", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/inference-optimization/workload", "os": ["linux"]},
{"file": "how-to/system-optimization/index", "os": ["linux"]},
{"file": "how-to/system-optimization/mi300x", "os": ["linux"]},
{"file": "how-to/system-optimization/mi200", "os": ["linux"]},
{"file": "how-to/system-optimization/mi100", "os": ["linux"]},
{"file": "how-to/system-optimization/w6000-v620", "os": ["linux"]},
{"file": "how-to/tuning-guides/mi300x/index", "os": ["linux"]},
{"file": "how-to/tuning-guides/mi300x/system", "os": ["linux"]},
{"file": "how-to/tuning-guides/mi300x/workload", "os": ["linux"]},
{"file": "how-to/system-debugging", "os": ["linux"]},
{"file": "how-to/gpu-enabled-mpi", "os": ["linux"]},
# {"file": "about/release-notes", "os": ["linux"], "date": "2025-06-26"},
# {"file": "release/changelog", "os": ["linux"],},
# {"file": "compatibility/compatibility-matrix", "os": ["linux"]},
# {"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
# {"file": "compatibility/ml-compatibility/tensorflow-compatibility", "os": ["linux"]},
# {"file": "compatibility/ml-compatibility/jax-compatibility", "os": ["linux"]},
# {"file": "how-to/deep-learning-rocm", "os": ["linux"]},
#
# {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},
#
# {"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
#
# {"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference", "os": ["linux"]},
#
# {"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
#
# {"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/inference-optimization/model-quantization", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/inference-optimization/optimizing-with-composable-kernel", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/inference-optimization/optimizing-triton-kernel", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/inference-optimization/profiling-and-debugging", "os": ["linux"]},
# {"file": "how-to/rocm-for-ai/inference-optimization/workload", "os": ["linux"]},
#
# {"file": "how-to/system-optimization/index", "os": ["linux"]},
# {"file": "how-to/system-optimization/mi300x", "os": ["linux"]},
# {"file": "how-to/system-optimization/mi200", "os": ["linux"]},
# {"file": "how-to/system-optimization/mi100", "os": ["linux"]},
# {"file": "how-to/system-optimization/w6000-v620", "os": ["linux"]},
# {"file": "how-to/tuning-guides/mi300x/index", "os": ["linux"]},
# {"file": "how-to/tuning-guides/mi300x/system", "os": ["linux"]},
# {"file": "how-to/tuning-guides/mi300x/workload", "os": ["linux"]},
# {"file": "how-to/system-debugging", "os": ["linux"]},
# {"file": "how-to/gpu-enabled-mpi", "os": ["linux"]},
]
external_toc_path = "./sphinx/_toc.yml"
# Options to improve documentation build time for preview release documentation
external_toc_exclude_missing = True # don't build files that aren't in the TOC
external_projects_remote_repository = "" # don't fetch data to resolve intersphinx xrefs
# Add the _extensions directory to Python's search path
sys.path.append(str(Path(__file__).parent / 'extension'))
@@ -143,14 +139,13 @@ html_static_path = ["sphinx/static/css", "extension/how-to/rocm-for-ai/inference
html_css_files = ["rocm_custom.css", "rocm_rn.css", "vllm-benchmark.css"]
html_js_files = ["vllm-benchmark.js"]
html_title = "ROCm Documentation"
html_title = "ROCm 7.0 Alpha documentation"
html_theme_options = {"link_main_doc": False}
redirects = {"reference/openmp/openmp": "../../about/compatibility/openmp.html"}
numfig = False
suppress_warnings = ["autosectionlabel.*"]
html_context = {
"project_path" : {project_path},


@@ -1,162 +0,0 @@
vllm_benchmark:
unified_docker:
latest:
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c
rocm_version: 6.4.1
vllm_version: 0.9.0.1 (0.9.0.2.dev108+g71faa1880.rocm641)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: pyt_vllm_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: float16
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
precision: float16
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
- group: Mistral AI
tag: mistral
models:
- model: Mixtral MoE 8x7B
mad_tag: pyt_vllm_mixtral-8x7b
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
- model: Mistral 7B
mad_tag: pyt_vllm_mistral-7b
model_repo: mistralai/Mistral-7B-Instruct-v0.3
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
precision: float16
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mistral 7B FP8
mad_tag: pyt_vllm_mistral-7b_fp8
model_repo: amd/Mistral-7B-v0.1-FP8-KV
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
precision: float8
- group: Qwen
tag: qwen
models:
- model: Qwen2 7B
mad_tag: pyt_vllm_qwen2-7b
model_repo: Qwen/Qwen2-7B-Instruct
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
precision: float16
- model: Qwen2 72B
mad_tag: pyt_vllm_qwen2-72b
model_repo: Qwen/Qwen2-72B-Instruct
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
precision: float16
- model: QwQ-32B
mad_tag: pyt_vllm_qwq-32b
model_repo: Qwen/QwQ-32B
url: https://huggingface.co/Qwen/QwQ-32B
precision: float16
tunableop: true
- group: Databricks DBRX
tag: dbrx
models:
- model: DBRX Instruct
mad_tag: pyt_vllm_dbrx-instruct
model_repo: databricks/dbrx-instruct
url: https://huggingface.co/databricks/dbrx-instruct
precision: float16
- model: DBRX Instruct FP8
mad_tag: pyt_vllm_dbrx_fp8
model_repo: amd/dbrx-instruct-FP8-KV
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
precision: float8
- group: Google Gemma
tag: gemma
models:
- model: Gemma 2 27B
mad_tag: pyt_vllm_gemma-2-27b
model_repo: google/gemma-2-27b
url: https://huggingface.co/google/gemma-2-27b
precision: float16
- group: Cohere
tag: cohere
models:
- model: C4AI Command R+ 08-2024
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
precision: float16
- model: C4AI Command R+ 08-2024 FP8
mad_tag: pyt_vllm_command-r-plus_fp8
model_repo: amd/c4ai-command-r-plus-FP8-KV
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
precision: float8
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek MoE 16B
mad_tag: pyt_vllm_deepseek-moe-16b-chat
model_repo: deepseek-ai/deepseek-moe-16b-chat
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
precision: float16
- group: Microsoft Phi
tag: phi
models:
- model: Phi-4
mad_tag: pyt_vllm_phi-4
model_repo: microsoft/phi-4
url: https://huggingface.co/microsoft/phi-4
- group: TII Falcon
tag: falcon
models:
- model: Falcon 180B
mad_tag: pyt_vllm_falcon-180b
model_repo: tiiuae/falcon-180B
url: https://huggingface.co/tiiuae/falcon-180B
precision: float16


@@ -1,163 +0,0 @@
vllm_benchmark:
unified_docker:
latest:
# TODO: update me
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab
rocm_version: 6.4.1
vllm_version: 0.9.1 (0.9.2.dev206+gb335519f2.rocm641)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: pyt_vllm_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: float16
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
precision: float16
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
- group: Mistral AI
tag: mistral
models:
- model: Mixtral MoE 8x7B
mad_tag: pyt_vllm_mixtral-8x7b
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
- model: Mistral 7B
mad_tag: pyt_vllm_mistral-7b
model_repo: mistralai/Mistral-7B-Instruct-v0.3
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
precision: float16
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mistral 7B FP8
mad_tag: pyt_vllm_mistral-7b_fp8
model_repo: amd/Mistral-7B-v0.1-FP8-KV
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
precision: float8
- group: Qwen
tag: qwen
models:
- model: Qwen2 7B
mad_tag: pyt_vllm_qwen2-7b
model_repo: Qwen/Qwen2-7B-Instruct
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
precision: float16
- model: Qwen2 72B
mad_tag: pyt_vllm_qwen2-72b
model_repo: Qwen/Qwen2-72B-Instruct
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
precision: float16
- model: QwQ-32B
mad_tag: pyt_vllm_qwq-32b
model_repo: Qwen/QwQ-32B
url: https://huggingface.co/Qwen/QwQ-32B
precision: float16
tunableop: true
- group: Databricks DBRX
tag: dbrx
models:
- model: DBRX Instruct
mad_tag: pyt_vllm_dbrx-instruct
model_repo: databricks/dbrx-instruct
url: https://huggingface.co/databricks/dbrx-instruct
precision: float16
- model: DBRX Instruct FP8
mad_tag: pyt_vllm_dbrx_fp8
model_repo: amd/dbrx-instruct-FP8-KV
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
precision: float8
- group: Google Gemma
tag: gemma
models:
- model: Gemma 2 27B
mad_tag: pyt_vllm_gemma-2-27b
model_repo: google/gemma-2-27b
url: https://huggingface.co/google/gemma-2-27b
precision: float16
- group: Cohere
tag: cohere
models:
- model: C4AI Command R+ 08-2024
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
precision: float16
- model: C4AI Command R+ 08-2024 FP8
mad_tag: pyt_vllm_command-r-plus_fp8
model_repo: amd/c4ai-command-r-plus-FP8-KV
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
precision: float8
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek MoE 16B
mad_tag: pyt_vllm_deepseek-moe-16b-chat
model_repo: deepseek-ai/deepseek-moe-16b-chat
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
precision: float16
- group: Microsoft Phi
tag: phi
models:
- model: Phi-4
mad_tag: pyt_vllm_phi-4
model_repo: microsoft/phi-4
url: https://huggingface.co/microsoft/phi-4
- group: TII Falcon
tag: falcon
models:
- model: Falcon 180B
mad_tag: pyt_vllm_falcon-180b
model_repo: tiiuae/falcon-180B
url: https://huggingface.co/tiiuae/falcon-180B
precision: float16


@@ -31,11 +31,3 @@ pytorch_inference_benchmark:
model_repo: genmo/mochi-1-preview
url: https://huggingface.co/genmo/mochi-1-preview
precision: float16
- group: Wan2.1
tag: wan
models:
- model: Wan2.1
mad_tag: pyt_wan2.1_inference
model_repo: Wan-AI/Wan2.1-T2V-14B
url: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
precision: bfloat16


@@ -1,11 +1,10 @@
vllm_benchmark:
unified_docker:
latest:
# TODO: update me
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c
rocm_version: 6.4.1
vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
vllm_version: 0.9.0.1 (0.9.0.2.dev108+g71faa1880.rocm641)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:


@@ -1,60 +1,29 @@
dockers:
- pull_tag: rocm/megatron-lm:v25.6_py312
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
components:
ROCm: 6.4.1
PyTorch: 2.8.0a0+git7d205b2
Python: 3.12
Transformer Engine: 2.1.0.dev0+8c4a512
hipBLASLt: 393e413
Triton: 3.3.0
RCCL: 2.23.4.7a84c5d
doc_name: Ubuntu 24.04 + Python 3.12
- pull_tag: rocm/megatron-lm:v25.6_py310
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
components:
ROCm: 6.4.1
PyTorch: 2.8.0a0+git7d205b2
Python: "3.10"
Transformer Engine: 2.1.0.dev0+8c4a512
hipBLASLt: 393e413
Triton: 3.3.0
RCCL: 2.23.4.7a84c5d
doc_name: Ubuntu 22.04 + Python 3.10
model_groups:
- group: Meta Llama
tag: llama
models:
megatron-lm_benchmark:
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.3 70B
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
- model: Llama 3.1 8B
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
- model: Llama 3.1 70B
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
- model: Llama 3.1 70B (proxy)
mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
- model: Llama 2 7B
mad_tag: pyt_megatron_lm_train_llama-2-7b
- model: Llama 2 70B
mad_tag: pyt_megatron_lm_train_llama-2-70b
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-V3 (proxy)
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-V3
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
- model: DeepSeek-V2-Lite
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
- group: Mistral AI
tag: mistral
models:
- group: Mistral AI
tag: mistral
models:
- model: Mixtral 8x7B
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
- model: Mixtral 8x22B (proxy)
- model: Mixtral 8x22B
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
- group: Qwen
tag: qwen
models:
- model: Qwen 2.5 7B
mad_tag: pyt_megatron_lm_train_qwen2.5-7b
- model: Qwen 2.5 72B
mad_tag: pyt_megatron_lm_train_qwen2.5-72b


@@ -1,29 +0,0 @@
megatron-lm_benchmark:
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.3 70B
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
- model: Llama 3.1 8B
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
- model: Llama 3.1 70B
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
- model: Llama 2 7B
mad_tag: pyt_megatron_lm_train_llama-2-7b
- model: Llama 2 70B
mad_tag: pyt_megatron_lm_train_llama-2-70b
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-V3
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
- model: DeepSeek-V2-Lite
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
- group: Mistral AI
tag: mistral
models:
- model: Mixtral 8x7B
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
- model: Mixtral 8x22B
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy

Binary file not shown (image size: 1.1 MiB before, 1.2 MiB after).


@@ -17,9 +17,6 @@ features for these ROCm-enabled deep learning frameworks.
* :doc:`PyTorch compatibility <../compatibility/ml-compatibility/pytorch-compatibility>`
* :doc:`TensorFlow compatibility <../compatibility/ml-compatibility/tensorflow-compatibility>`
* :doc:`JAX compatibility <../compatibility/ml-compatibility/jax-compatibility>`
* :doc:`verl compatibility <../compatibility/ml-compatibility/verl-compatibility>`
* :doc:`Stanford Megatron-LM compatibility <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`
* :doc:`DGL compatibility <../compatibility/ml-compatibility/dgl-compatibility>`
This chart steps through typical installation workflows for installing deep learning frameworks for ROCm.
@@ -32,9 +29,6 @@ See the installation instructions to get started.
* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/dgl-install>`
.. note::


@@ -7,21 +7,21 @@ AMD Instinct MI300X performance guides
**************************************
The following performance guides provide essential guidance on the necessary
steps to properly `configure your system for AMD Instinct™ MI300X accelerators
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
They include detailed instructions on system settings and application
:doc:`workload tuning </how-to/rocm-for-ai/inference-optimization/workload>` to
help you leverage the maximum capabilities of these accelerators and achieve
superior performance.
steps to properly :doc:`configure your system for AMD Instinct™ MI300X
accelerators <../system-optimization/mi300x>`. They include detailed
instructions on system settings and application :doc:`workload tuning
<../rocm-for-ai/inference-optimization/workload>` to help you
leverage the maximum capabilities of these accelerators and achieve superior
performance.
* `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__
covers essential system settings and system management practices to configure
your AMD Instinct MI300X system for performance.
* :doc:`/how-to/rocm-for-ai/inference-optimization/workload` covers steps to
* :doc:`../rocm-for-ai/inference-optimization/workload` covers steps to
optimize the performance of AMD Instinct MI300X series accelerators for HPC
and deep learning operations.
* :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm` introduces a preconfigured
* :doc:`../rocm-for-ai/inference/vllm-benchmark` introduces a preconfigured
environment for LLM inference, designed to help you test performance with
popular models on AMD Instinct MI300X series accelerators.


@@ -24,3 +24,5 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
- :doc:`Fine-tuning and inference <fine-tuning-and-inference>` using a
:doc:`single-accelerator <single-gpu-fine-tuning-and-inference>` or
:doc:`multi-accelerator <multi-gpu-fine-tuning-and-inference>` system.


@@ -6,7 +6,7 @@
Use ROCm for AI
**************************
ROCm is an open-source software platform that enables high-performance computing and machine learning applications. It features the ability to accelerate training, fine-tuning, and inference for AI application development. With ROCm, you can access the full power of AMD GPUs, which can significantly improve the performance and efficiency of AI workloads.
You can use ROCm to perform distributed training, which enables you to train models across multiple GPUs or nodes simultaneously. Additionally, ROCm supports mixed-precision training, which can help reduce the memory and compute requirements of training workloads. For fine-tuning, ROCm provides access to various algorithms and optimization techniques. In terms of inference, ROCm provides several techniques that can help you optimize your models for deployment, such as quantization, GEMM tuning, and optimization with composable kernel.


@@ -151,8 +151,8 @@ desired effect. Continuous iteration helps refine the performance gains and
address any new bottlenecks that may emerge.
ROCm provides a prebuilt optimized Docker image that has everything required to implement
the LLM inference tips in this section. It includes ROCm, PyTorch, and vLLM.
For more information, see :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
the tips in this section. It includes ROCm, vLLM, PyTorch, and tuning files in the CSV
format. For more information, see :doc:`../inference/vllm-benchmark`.
.. _mi300x-profiling-tools:
@@ -343,10 +343,9 @@ The following performance tips are not *specific* to vLLM -- they are general
but relevant in this context. You can tune the following vLLM parameters to
achieve optimal request latency and throughput performance.
* As described in `Environment variables (MI300X)
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#environment-variables>`_,
the environment variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM
performance. Set it to ``export HIP_FORCE_DEV_KERNARG=1``.
* As described in :ref:`mi300x-env-vars`, the environment
variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM performance. Set it to
``export HIP_FORCE_DEV_KERNARG=1``.
* Set the :ref:`RCCL environment variable <mi300x-rccl>` ``NCCL_MIN_NCHANNELS``
to ``112`` to increase the number of channels on MI300X to potentially improve
@@ -411,9 +410,9 @@ for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
usage with ROCm.
ROCm provides a prebuilt optimized Docker image for validating the performance
of LLM inference with vLLM on MI300X series accelerators. The Docker image includes
ROCm, vLLM, and PyTorch. For more information, see
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
of LLM inference with vLLM on the MI300X accelerator. The Docker image includes
ROCm, vLLM, PyTorch, and tuning files in the CSV format. For more information,
see :doc:`../inference/vllm-benchmark`.
.. _mi300x-vllm-throughput-measurement:
@@ -1478,9 +1477,8 @@ following command: ``cat /proc/sys/kernel/numa_balancing`` and
checking whether the output is ``0``.
If the output is ``1``, you can disable NUMA auto-balancing by running the
following command: ``sudo sysctl kernel.numa_balancing=0``. For more details,
see `AMD Instinct MI300X system optimization
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#disable-numa-auto-balancing>`_.
following command: ``sudo sysctl kernel.numa_balancing=0``. For more
details, see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
.. _mi300x-rccl-disable-acs:


@@ -59,7 +59,7 @@ MI300X accelerator with the prebuilt vLLM Docker image.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
.. code-block:: shell
@@ -322,22 +322,22 @@ Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
including inference with vLLM, see :doc:`/how-to/tuning-guides/mi300x/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X accelerators, see :doc:`/how-to/system-optimization/mi300x`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Using ROCm for AI </how-to/rocm-for-ai/index>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to optimize inference on LLMs, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/llm-fine-tuning-optimization/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
- For a list of other ready-made Docker images for ROCm, see the
:doc:`Docker image support matrix <rocm-install-on-linux:reference/docker-image-support-matrix>`.
Previous versions
=================


@@ -82,7 +82,7 @@ MI300X accelerator with the prebuilt vLLM Docker image.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
.. code-block:: shell
@@ -392,22 +392,25 @@ Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
including inference with vLLM, see :doc:`/how-to/tuning-guides/mi300x/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X accelerators, see :doc:`/how-to/system-optimization/mi300x`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Using ROCm for AI </how-to/rocm-for-ai/index>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to optimize inference on LLMs, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/llm-fine-tuning-optimization/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
- For a list of other ready-made Docker images for ROCm, see the
:doc:`Docker image support matrix <rocm-install-on-linux:reference/docker-image-support-matrix>`.
- To compare with the previous version of the ROCm vLLM Docker image for performance validation, refer to
`LLM inference performance validation on AMD Instinct MI300X (ROCm 6.2.0) <https://rocm.docs.amd.com/en/docs-6.2.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_.
Previous versions
=================

View File

@@ -55,7 +55,7 @@ MI300X accelerator with the prebuilt vLLM Docker image.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
.. code-block:: shell
@@ -437,22 +437,22 @@ Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
including inference with vLLM, see :doc:`../inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X accelerators, see :doc:`../../system-optimization/mi300x`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../inference-optimization/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../fine-tuning/index>`.
Previous versions
=================

View File

@@ -130,7 +130,7 @@ vLLM inference performance testing
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
.. code-block:: shell
@@ -305,22 +305,22 @@ Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
including inference with vLLM, see :doc:`../inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../inference-optimization/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../fine-tuning/index>`.
Previous versions
=================

View File

@@ -1,5 +1,3 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
@@ -321,22 +319,22 @@ Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
including inference with vLLM, see :doc:`../inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../inference-optimization/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../fine-tuning/index>`.
Previous versions
=================

View File

@@ -333,19 +333,19 @@ Further reading
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
including inference with vLLM, see :doc:`../../../inference-optimization/workload`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <../../hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../../../inference-optimization/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../../../fine-tuning/index>`.
Previous versions
=================

View File

@@ -333,23 +333,22 @@ Further reading
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
including inference with vLLM, see :doc:`../../inference-optimization/workload`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <../hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../../inference-optimization/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../../fine-tuning/index>`.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -1,353 +0,0 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.0.1_20250605-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models:
Supported models
================
The following models are supported for inference performance benchmarking
with vLLM and ROCm. Some instructions, commands, and recommendations in this
documentation might vary by model -- select one to get started.
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _vllm-benchmark-vllm:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing popular AI models.
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker>`__.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Pull the Docker image
=====================
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the ``{{model.precision}}`` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
{% if model.tunableop %}
.. note::
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
TunableOp automatically explores different implementations and configurations of certain PyTorch
operators to find the fastest one for your hardware.
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on``.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
{% endif %}
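As a concrete way to make the ``models.json`` edit described above, an illustrative one-liner -- this assumes the literal string ``--tunableop off`` appears in the model's run ``args``:
.. code-block:: shell
# flip the TunableOp flag in-place before launching the run (illustrative)
sed -i 's/--tunableop off/--tunableop on/' models.json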
.. tab-item:: Standalone benchmarking
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
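To skim a generated summary from the shell, a one-liner such as the following can help. The resolved path is hypothetical -- it assumes a ``float16`` run on ROCm 6.4.1 against a ``Llama-3.1-8B-Instruct`` repository -- so substitute the path reported for your own run:
.. code-block:: shell
# align the comma-separated columns for easier reading (path is illustrative)
column -s, -t < ./reports_float16_vllm_rocm6.4.1/summary/Llama-3.1-8B-Instruct_throughput_report.csv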
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
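As an illustrative check of these formulas, assume 32 requests with 128-token inputs and 128-token outputs completing in 10 seconds (all values hypothetical):
- .. math:: throughput\_tot = 32 \times (128 + 128) / 10 = 819.2 \text{ tokens/s}
- .. math:: throughput\_gen = 32 \times 128 / 10 = 409.6 \text{ tokens/s}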
{% endfor %}
{% endfor %}
Further reading
===============
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -1,353 +0,0 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements-20250702>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models-20250702:
Supported models
================
The following models are supported for inference performance benchmarking
with vLLM and ROCm. Some instructions, commands, and recommendations in this
documentation might vary by model -- select one to get started.
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _vllm-benchmark-vllm:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements-20250702:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing popular AI models.
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/5486e7bc8523be0324ccd68f221959445b56cc2a/docs/dev-docker>`__.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Pull the Docker image
=====================
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models-20250702>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
{% if model.tunableop %}
.. note::
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
TunableOp automatically explores different implementations and configurations of certain PyTorch
operators to find the fastest one for your hardware.
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on``.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
{% endif %}
.. tab-item:: Standalone benchmarking
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
{% endfor %}
{% endfor %}
Further reading
===============
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -11,99 +11,65 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
.. list-table::
:header-rows: 1
:stub-columns: 1
* - Docker image tag
- Components
* - ROCm version
- vLLM version
- PyTorch version
- Resources
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
(latest)
-
* ROCm 6.4.1
* vLLM 0.9.1
* PyTorch 2.7.0
* - 6.4.0
- 0.9.0.1
- 2.7.0
-
* :doc:`Documentation <../vllm>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
-
* ROCm 6.4.1
* vLLM 0.9.1
* PyTorch 2.7.0
-
* :doc:`Documentation <vllm-0.9.1-20250702>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab>`__
* - ``rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605``
-
* ROCm 6.4.1
* vLLM 0.9.0.1
* PyTorch 2.7.0
-
* :doc:`Documentation <vllm-0.9.0.1-20250605>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c>`_
* - ``rocm/vllm:rocm6.3.1_vllm_0.8.5_20250521``
-
* ROCm 6.3.1
* vLLM 0.8.5 (0.8.6.dev)
* PyTorch 2.7.0
* - 6.3.1
- 0.8.5 (0.8.6.dev)
- 2.7.0
-
* :doc:`Documentation <vllm-0.8.5-20250521>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`__
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_
* - ``rocm/vllm:rocm6.3.1_vllm_0.8.5_20250513``
-
* ROCm 6.3.1
* vLLM 0.8.5
* PyTorch 2.7.0
* - 6.3.1
- 0.8.5
- 2.7.0
-
* :doc:`Documentation <vllm-0.8.5-20250513>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`__
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`_
* - ``rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415``
-
* ROCm 6.3.1
* vLLM 0.8.3
* PyTorch 2.7.0
* - 6.3.1
- 0.8.3
- 2.7.0
-
* :doc:`Documentation <vllm-0.8.3-20250415>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`__
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`_
* - ``rocm/vllm:rocm6.3.1_instinct_vllm0.7.3_20250325``
-
* ROCm 6.3.1
* vLLM 0.7.3
* PyTorch 2.7.0
* - 6.3.1
- 0.7.3
- 2.7.0
-
* :doc:`Documentation <vllm-0.7.3-20250325>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`__
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`_
* - ``rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6``
-
* ROCm 6.3.1
* vLLM 0.6.6
* PyTorch 2.7.0
* - 6.3.1
- 0.6.6
- 2.7.0
-
* :doc:`Documentation <vllm-0.6.6>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`__
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_
* - ``rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4``
-
* ROCm 6.2.1
* vLLM 0.6.4
* PyTorch 2.5.0
* - 6.2.1
- 0.6.4
- 2.5.0
-
* :doc:`Documentation <vllm-0.6.4>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`__
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_
* - ``rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50``
-
* ROCm 6.2.0
* vLLM 0.4.3
* PyTorch 2.4.0
* - 6.2.0
- 0.4.3
- 2.4.0
-
* :doc:`Documentation <vllm-0.4.3>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`__
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`_

View File

@@ -32,10 +32,10 @@ PyTorch inference performance testing
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
<div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
@@ -103,7 +103,7 @@ PyTorch inference performance testing
The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.
@@ -140,11 +140,7 @@ PyTorch inference performance testing
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{model.mad_tag}} \
--keep-model-dir \
--live-output \
--timeout 28800
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
@@ -155,7 +151,8 @@ PyTorch inference performance testing
For improved performance, consider enabling TunableOp. By default,
``{{model.mad_tag}}`` runs with TunableOp disabled (see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable
it, include the ``--tunableop on`` argument in your run.
it, edit the default run behavior in ``tools/run_models.py`` -- update the model's
run ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
Although this might increase the initial run time, it can result in a performance gain.
@@ -166,10 +163,8 @@ PyTorch inference performance testing
Further reading
===============
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
- To learn more about system settings and management practices to configure your system for
AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../../inference-optimization/workload`.

View File

@@ -20,55 +20,23 @@ vLLM inference performance testing
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
.. list-table::
:header-rows: 1
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
* - Software component
- Version
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* - `ROCm <https://github.com/ROCm/ROCm>`__
- {{ unified_docker.rocm_version }}
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
* - `vLLM <https://docs.vllm.ai/en/latest>`__
- {{ unified_docker.vllm_version }}
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
* - `PyTorch <https://github.com/ROCm/pytorch>`__
- {{ unified_docker.pytorch_version }}
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
- {{ unified_docker.hipblaslt_version }}
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
MI300X series accelerators.
What's new
==========
The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
* The ``--compilation-config`` parameter is no longer required as its options are now enabled by default.
This parameter has been removed from the benchmarking script.
* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
This parameter has been removed from the benchmarking script.
* Fixed a ``+rms_norm`` custom kernel issue.
* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, and ``INT4`` (see the sketch after this list).
* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
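A minimal sketch of enabling quick reduce before a benchmark run follows; the variable name and supported modes come from the note above, while everything else about the run is unchanged:
.. code-block:: shell
# select the FP quick reduce mode (INT8, INT6, and INT4 are also supported)
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP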
Supported models
================
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models:
Supported models
================
The following models are supported for inference performance benchmarking
with vLLM and ROCm. Some instructions, commands, and recommendations in this
documentation might vary by model -- select one to get started.
@@ -117,48 +85,56 @@ Supported models
{% endfor %}
{% endfor %}
.. note::
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements:
.. _vllm-benchmark-performance-measurements:
Performance measurements
========================
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing popular AI models.
.. important::
The performance data presented in
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
page provides reference throughput and latency measurements for inferencing popular AI models.
System validation
=================
.. important::
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting the benchmarks.
Advanced features and known issues
==================================
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker>`__.
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
System validation
=================
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Pull the Docker image
=====================
@@ -187,26 +163,22 @@ system's configuration.
.. tab-item:: MAD-integrated benchmarking
1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the ``{{model.precision}}`` data type on the host machine.
.. code-block:: shell
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
madengine run \
--tags {{model.mad_tag}} \
--keep-model-dir \
--live-output \
--timeout 28800
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
@@ -226,110 +198,86 @@ system's configuration.
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
To enable it, include the ``--tunableop on`` argument in your
run.
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on``.
Enabling TunableOp triggers a two-pass run -- a warm-up followed
by the performance-collection run.
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
{% endif %}
.. tab-item:: Standalone benchmarking
.. rubric:: Download the Docker image and required scripts
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
1. Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block::
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
docker pull {{ unified_docker.pull_tag }}
docker run -it \
--device=/dev/kfd \
--device=/dev/dri \
--group-add video \
--shm-size 16G \
--security-opt seccomp=unconfined \
--security-opt apparmor=unconfined \
--cap-add=SYS_PTRACE \
-v $(pwd):/workspace \
--env HUGGINGFACE_HUB_CACHE=/workspace \
--name test \
{{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
2. In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
3. To start the benchmark, use the following command with the appropriate options.
.. code-block::
.. dropdown:: Benchmark options
:open:
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
Command:
The input sequence length, output sequence length, and tensor parallel (TP) are
already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block::
./vllm_benchmark_report.sh \
-s $test_option \
-m {{model.model_repo}} \
-g $num_gpu \
-d {{model.precision}}
OSError: You are trying to access a gated repo.
.. note::
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
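A minimal sketch applying this recommendation inside the container, assuming the variable is exported in the same shell before the benchmark script runs:
.. code-block:: shell
# recommended for best performance per the note above
export VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
./vllm_benchmark_report.sh -s all -m {{model.model_repo}} -g 8 -d {{model.precision}}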
.. rubric:: Benchmarking examples
Here are some examples of running the benchmark with various options:
* Latency benchmark
.. code-block:: shell

   ./vllm_benchmark_report.sh \
     -s latency \
     -m {{model.model_repo}} \
     -g 8 \
     -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
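To skim the results without leaving the shell, any CSV tool works; for example, ``column`` from util-linux (a generic convenience, not part of the benchmark tooling):

.. code-block:: shell

   # Pretty-print the latency summary as an aligned table
   column -s, -t ./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/*_latency_report.csv | less -S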
.. code-block:: shell

   ./vllm_benchmark_report.sh \
     -s throughput \
     -m {{model.model_repo}} \
     -g 8 \
     -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
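* Combined benchmark

To measure latency and throughput in one invocation, use the ``all`` test option from the options table; the remaining flags are unchanged. Both summary reports should then appear under the same ``summary`` directory.

.. code-block:: shell

   ./vllm_benchmark_report.sh \
     -s all \
     -m {{model.model_repo}} \
     -g 8 \
     -d {{model.precision}}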
{% endfor %}
{% endfor %}
Advanced usage
==============
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
Reproducing the Docker image
----------------------------
To reproduce this ROCm/vLLM Docker image release, follow these steps:
1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
.. code-block:: shell

   git clone https://github.com/ROCm/vllm.git
2. Checkout the specific release commit.
.. code-block:: shell

   cd vllm
   git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
.. code-block:: shell

   docker build -f docker/Dockerfile.rocm -t vllm-rocm .
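Once the build completes, you can launch the locally built image with the same ``docker run`` flags used for the prebuilt image earlier in this guide, substituting the tag from step 3:

.. code-block:: shell

   docker run -it \
     --device=/dev/kfd \
     --device=/dev/dri \
     --group-add video \
     --shm-size 16G \
     -v $(pwd):/workspace \
     vllm-rocm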
Known issues and workarounds
============================
AITER does not support FP8 KV cache yet.
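As a possible workaround until support lands, keep the KV cache in its default 16-bit format when AITER is in use. The sketch below assumes upstream vLLM conventions (the ``--kv-cache-dtype`` engine argument and the ``VLLM_ROCM_USE_AITER`` environment variable), with ``$MODEL`` standing in for your model repository; verify both against this image's documentation:

.. code-block:: shell

   # Option 1: keep the default 16-bit KV cache dtype when AITER is enabled
   vllm serve $MODEL --kv-cache-dtype auto

   # Option 2: if an FP8 KV cache is required, disable AITER instead
   # (upstream vLLM toggle; confirm it applies to this image)
   export VLLM_ROCM_USE_AITER=0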
Further reading
===============
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../../inference-optimization/workload`.
- To learn how to run LLMs from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <../hugging-face-models>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../../inference-optimization/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../../fine-tuning/index>`.
Previous versions
=================
See :doc:`previous-versions/vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.


The AI Developer Hub contains `AMD ROCm tutorials <https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/>`_ for
training, fine-tuning, and inference. The tutorials leverage popular machine learning frameworks on AMD GPUs.
- :doc:`Installing ROCm and machine learning frameworks <install>`
- :doc:`Running models from Hugging Face <hugging-face-models>`
- :doc:`LLM inference frameworks <llm-inference-frameworks>`
- :doc:`vLLM inference performance testing <vllm-benchmark>`
- :doc:`PyTorch inference performance testing <pytorch-inference-benchmark>`
- :doc:`Deploying your model <deploy-your-model>`


ROCm provides a prebuilt optimized Docker image for validating the performance of LLM inference with vLLM
on the MI300X accelerator. The Docker image includes ROCm, vLLM, and PyTorch.
For more information, see :doc:`vllm-benchmark`.
.. _fine-tuning-llms-tgi:
