Add pybind11 as a pip module requirement for Azure.

[Ex CI] revert PRIM default branch to develop (#4960)
[Ex CI] allow rerun jobs to upload artifacts (#4959)
2026-01-11 15:47:59 -05:00 · 2025-06-24 08:06:52 -05:00 · 2025-06-23 16:35:02 -04:00 · 2025-06-23 15:37:52 -04:00 · 2025-06-23 15:13:11 -04:00 · 2025-06-23 13:45:50 -04:00
127 changed files with 12264 additions and 4810 deletions
--- a/.azuredevops/ci-builds/mathlibs-trigger.yml
+++ b/.azuredevops/ci-builds/mathlibs-trigger.yml
@@ -0,0 +1,33 @@
 variables:
 - group: common
 - template: /.azuredevops/variables-global.yml@pipelines_repo
 parameters:
 - name: pipelinesRepoRef
  type: string
  default: refs/heads/develop
 - name: librariesRepoRef
  type: string
  default: refs/heads/develop
 resources:
  repositories:
  - repository: pipelines_repo
    type: github
    endpoint: ROCm
    name: ROCm/ROCm
    ref: ${{ parameters.pipelinesRepoRef }}
  - repository: libraries_repo
    type: github
    endpoint: ROCm
    name: ROCm/rocm-libraries
    ref: ${{ parameters.librariesRepoRef }}
 trigger: none
 pr: none
 jobs:
  - template: /.azuredevops/ci-builds/mathlibs.yml@pipelines_repo
    parameters:
      checkoutRepo: libraries_repo
      buildDependsOn: false
--- a/.azuredevops/ci-builds/mathlibs.yml
+++ b/.azuredevops/ci-builds/mathlibs.yml
@@ -0,0 +1,38 @@
 # entrypoint for kicking off a unified build of the mathlibs
 # this template is designed to be called by another pipeline (llvm, clr, etc.)
 # `buildDependsOn` will need to be set when calling this template
 # passes a `unifiedBuild` param to downstream pipelines, which will prevent duplicate jobs
 # logic needs to be added in individual mathlib pipelines for handling `unifiedBuild`
 parameters:
 - name: checkoutRepo
  type: string
  default: monorepo
 - name: buildDependsOn
  type: object
  default: false
 - name: downstreamComponentMatrix
  type: object
  default:
    - rocRAND:
      name: rocRAND
      sparseCheckoutDir: projects/rocrand
    - rocPRIM:
      name: rocPRIM
      sparseCheckoutDir: projects/rocprim
    - hipBLAS-common:
      name: hipBLAS-common
      sparseCheckoutDir: projects/hipblas-common
    # - composable_kernel:
    #   name: composable_kernel
    #   sparseCheckoutDir: projects/composablekernel
 jobs:
 - ${{ each component in parameters.downstreamComponentMatrix }}:
  - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
      sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
      buildDependsOn: ${{ parameters.buildDependsOn }}
      triggerDownstreamJobs: true
      unifiedBuild: true
--- a/.azuredevops/components/HIP.yml
+++ b/.azuredevops/components/HIP.yml
@@ -20,7 +20,7 @@ parameters:
    - ocl-icd-libopencl1
    - ocl-icd-opencl-dev
    - opencl-headers
-    - python3-pip
+    - zlib1g-dev
 - name: pipModules
  type: object
  default:
@@ -41,120 +41,148 @@ parameters:
 # any changes for clr should just trigger HIP pipeline
 # similarly for hipother repo, for Nvidia backend
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
 # HIP with AMD backend
 jobs:
- job: hip_clr_combined_amd
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: hip_clr_combined_${{ job.os }}_amd
-  - group: common
+    pool:
-  - template: /.azuredevops/variables-global.yml
+      vmImage: 'ubuntu-22.04'
-  pool:
+    ${{ if eq(job.os, 'almalinux8') }}:
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+      container:
-  workspace:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    clean: all
+        endpoint: ContainerService3
-  steps:
+    variables:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+    - group: common
-    parameters:
+    - template: /.azuredevops/variables-global.yml
-      aptPackages: ${{ parameters.aptPackages }}
+    workspace:
-      pipModules: ${{ parameters.pipModules }}
+      clean: all
-# checkout triggering repo (either HIP or clr)
+    steps:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-    parameters:
+      parameters:
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+        aptPackages: ${{ parameters.aptPackages }}
-# if this is triggered by HIP repo, matching repo is clr
+        pipModules: ${{ parameters.pipModules }}
-# if this is triggered by clr repo, matching repo is HIP
+        packageManager: ${{ job.packageManager }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+  # checkout triggering repo (either HIP or clr)
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      checkoutRepo: matching_repo
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-    parameters:
+  # if this is triggered by HIP repo, matching repo is clr
-      checkoutRepo: hipother_repo
+  # if this is triggered by clr repo, matching repo is HIP
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-    parameters:
+      parameters:
-      checkoutRef: ${{ parameters.checkoutRef }}
+        checkoutRepo: matching_repo
-      dependencyList: ${{ parameters.rocmDependenciesAMD }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      aggregatePipeline: ${{ parameters.aggregatePipeline }}
+      parameters:
-# compile clr
+        checkoutRepo: hipother_repo
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-    parameters:
+      parameters:
-      componentName: clr
+        checkoutRef: ${{ parameters.checkoutRef }}
-      cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
+        dependencyList: ${{ parameters.rocmDependenciesAMD }}
-      cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-      extraBuildFlags: >-
+        os: ${{ job.os }}
-        -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
+  # compile clr
-        -DHIP_PLATFORM=amd
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+      parameters:
-        -DROCM_PATH=$(Agent.BuildDirectory)/rocm
+        componentName: clr
-        -DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
+        cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
-        -DCLR_BUILD_HIP=ON
+        cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
-        -DCLR_BUILD_OCL=ON
+        os: ${{ job.os }}
-        -GNinja
+        useAmdclang: false
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+        extraBuildFlags: >-
-    parameters:
+          -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
-      artifactName: amd
+          -DHIP_PLATFORM=amd
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-    parameters:
+          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
-      artifactName: amd
+          -DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+          -DCLR_BUILD_HIP=ON
-  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+          -DCLR_BUILD_OCL=ON
-  #   parameters:
+          -GNinja
-  #     aptPackages: ${{ parameters.aptPackages }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
-  #     pipModules: ${{ parameters.pipModules }}
+      parameters:
-  #     environment: amd
+        artifactName: amd
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        artifactName: amd
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
    #     pipModules: ${{ parameters.pipModules }}
    #     environment: amd
 # HIP with Nvidia backend
- job: hip_clr_combined_nvidia
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: hip_clr_combined_${{ job.os }}_nvidia
-  - group: common
+    pool:
-  - template: /.azuredevops/variables-global.yml
+      vmImage: 'ubuntu-22.04'
-  pool:
+    ${{ if eq(job.os, 'almalinux8') }}:
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+      container:
-  workspace:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    clean: all
+        endpoint: ContainerService3
-  steps:
+    variables:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+    - group: common
-    parameters:
+    - template: /.azuredevops/variables-global.yml
-      aptPackages: ${{ parameters.aptPackages }}
+    workspace:
-      pipModules: ${{ parameters.pipModules }}
+      clean: all
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    steps:
-# checkout triggering repo (either HIP or clr)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+      parameters:
-    parameters:
+        aptPackages: ${{ parameters.aptPackages }}
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+        pipModules: ${{ parameters.pipModules }}
-# if this is triggered by HIP repo, matching repo is clr
+        packageManager: ${{ job.packageManager }}
-# if this is triggered by clr repo, matching repo is HIP
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+  # checkout triggering repo (either HIP or clr)
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      checkoutRepo: matching_repo
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-    parameters:
+  # if this is triggered by HIP repo, matching repo is clr
-      checkoutRepo: hipother_repo
+  # if this is triggered by clr repo, matching repo is HIP
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-    parameters:
+      parameters:
-      checkoutRef: ${{ parameters.checkoutRef }}
+        checkoutRepo: matching_repo
-      dependencyList: ${{ parameters.rocmDependenciesNvidia }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      aggregatePipeline: ${{ parameters.aggregatePipeline }}
+      parameters:
-  - script: 'ls -1R $(Agent.BuildDirectory)/rocm'
+        checkoutRepo: hipother_repo
-    displayName: 'Artifact listing'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-# compile clr
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        checkoutRef: ${{ parameters.checkoutRef }}
-    parameters:
+        dependencyList: ${{ parameters.rocmDependenciesNvidia }}
-      componentName: clr
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-      cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
+        os: ${{ job.os }}
-      cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
+    - script: 'ls -1R $(Agent.BuildDirectory)/rocm'
-      extraBuildFlags: >-
+      displayName: 'Artifact listing'
-        -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
+  # compile clr
-        -DHIP_PLATFORM=nvidia
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-        -DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
+      parameters:
-        -DCLR_BUILD_HIP=ON
+        componentName: clr
-        -DCLR_BUILD_OCL=OFF
+        cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
-        -DHIPNV_DIR=$(Build.SourcesDirectory)/hipother/hipnv
+        cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
-        -GNinja
+        os: ${{ job.os }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+        useAmdclang: false
-    parameters:
+        extraBuildFlags: >-
-      artifactName: nvidia
+          -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+          -DHIP_PLATFORM=nvidia
-  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+          -DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
-  #   parameters:
+          -DCLR_BUILD_HIP=ON
-  #     aptPackages: ${{ parameters.aptPackages }}
+          -DCLR_BUILD_OCL=OFF
-  #     pipModules: ${{ parameters.pipModules }}
+          -DHIPNV_DIR=$(Build.SourcesDirectory)/hipother/hipnv
-  #     environment: nvidia
+          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        artifactName: nvidia
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
    #     pipModules: ${{ parameters.pipModules }}
    #     environment: nvidia
--- a/.azuredevops/components/HIPIFY.yml
+++ b/.azuredevops/components/HIPIFY.yml
@@ -1,4 +1,7 @@
 parameters:
 - name: componentName
  type: string
  default: HIPIFY
 - name: checkoutRepo
  type: string
  default: 'self'
@@ -13,113 +16,140 @@ parameters:
 - name: aptPackages
  type: object
  default:
-    - cmake
+    - cuda-toolkit-12-9
-    - ninja-build
+    - libcudnn9-dev-cuda-12
    - libnuma-dev
    - mesa-common-dev
    - ninja-build
    - python-is-python3
    - python3-dev
    - python3-pip
-    - python-is-python3
+- name: pipModules
-    - mesa-common-dev
+  type: object
-    - ccache
+  default:
-    - cuda-toolkit
+    - lit
-    - cudnn
+- name: rocmDependencies
  type: object
  default:
    - llvm-project
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
 jobs:
- job: HIPIFY
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}
-  - group: common
+    variables:
-  - template: /.azuredevops/variables-global.yml
+    - group: common
-  - name: UPSTREAM_LLVM_GIT_URL
+    - template: /.azuredevops/variables-global.yml
-    value: https://github.com/llvm/llvm-project.git
+    pool:
-  - name: UPSTREAM_LLVM_TAG
+      ${{ if eq(job.os, 'ubuntu2404') }}:
-    value: llvmorg-18.1.2
+        name: rocm-ci_medium_build_pool_2404
-  pool: ${{ variables.MEDIUM_BUILD_POOL }}
+      ${{ else }}:
-  workspace:
+        name: ${{ variables.MEDIUM_BUILD_POOL }}
-    clean: all
+    ${{ if eq(job.os, 'almalinux8') }}:
-  steps:
+      container:
-  - task: Bash@3
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    displayName: 'Register CUDA packages'
+        endpoint: ContainerService3
-    inputs:
+    workspace:
-      targetType: inline
+      clean: all
-      script: |
+    steps:
-        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+    - task: Bash@3
-        sudo dpkg -i cuda-keyring_1.1-1_all.deb
+      displayName: 'Register CUDA packages'
-        sudo rm -f cuda-keyring_1.1-1_all.deb
+      inputs:
-        sudo apt update
+        targetType: inline
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        ${{ if eq(job.os, 'ubuntu2204') }}:
-    parameters:
+          script: |
-      aptPackages: ${{ parameters.aptPackages }}
+            wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+            sudo dpkg -i cuda-keyring_1.1-1_all.deb
-  - task: Bash@3
+            sudo rm -f cuda-keyring_1.1-1_all.deb
-    displayName: git clone upstream llvm-project
+            sudo apt update
-    inputs:
+        ${{ if eq(job.os, 'almalinux8') }}:
-      targetType: inline
+          script: |
-      script: git clone $(UPSTREAM_LLVM_GIT_URL) --depth=1 --branch $(UPSTREAM_LLVM_TAG) --recurse-submodules
+            sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
-      workingDirectory: $(Pipeline.Workspace)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+      parameters:
-    parameters:
+        aptPackages: ${{ parameters.aptPackages }}
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+        pipModules: ${{ parameters.pipModules }}
-  - script: |
+        packageManager: ${{ job.packageManager }}
-      mkdir -p $(CCACHE_DIR)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
-      echo "##vso[task.prependpath]/usr/lib/ccache:/usr/local/cuda/bin"
+    - task: Bash@3
-    displayName: Update path for cuda and ccache
+      displayName: Add lit to PATH
-  - task: Cache@2
+      inputs:
-    displayName: Ccache caching
+        targetType: inline
-    inputs:
+        script: |
-      key: HIPIFY | $(Agent.OS) | "$(UPSTREAM_LLVM_TAG)"
+          site_packages=$(python3 -m site --user-base)/bin
-      path: $(CCACHE_DIR)
+          sudo ln -sf $site_packages/lit $(Pipeline.Workspace)/llvm-lit  # $site_packages already ends in /bin
-      restoreKeys: HIPIFY | $(Agent.OS)
+          echo "##vso[task.prependpath]$site_packages"
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      componentName: upstream-llvm
+      parameters:
-      cmakeBuildDir: $(Pipeline.Workspace)/llvm-project/llvm/build
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-      cmakeSourceDir: $(Pipeline.Workspace)/llvm-project/llvm
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      installDir: $(Pipeline.Workspace)/llvm
+      parameters:
-      extraBuildFlags: >-
+        checkoutRef: ${{ parameters.checkoutRef }}
-        -DCMAKE_BUILD_TYPE=Release
+        dependencyList: ${{ parameters.rocmDependencies }}
-        -DLLVM_ENABLE_PROJECTS=clang
+        os: ${{ job.os }}
-        -DLLVM_INCLUDE_TESTS=OFF
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+    # cutensor is not available from apt or dnf
-        -DCMAKE_C_COMPILER_LAUNCHER=ccache
+    - task: Bash@3
-        -GNinja
+      displayName: 'Download and install cutensor'
-  - task: Bash@3
+      inputs:
-    displayName: python install lit
+        targetType: inline
-    inputs:
+        script: |
-      targetType: inline
+          wget -q --show-progress https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor/linux-x86_64/libcutensor-linux-x86_64-2.2.0.0-archive.tar.xz
-      script: sudo python3 $(Pipeline.Workspace)/llvm-project/llvm/utils/lit/setup.py install
+          tar -xvJf libcutensor-linux-x86_64-*.tar.xz
-  - task: Bash@3
+          mkdir -p $(Pipeline.Workspace)/cutensor
-    displayName: install FileCheck
+          cp -r libcutensor-linux-x86_64-*/* $(Pipeline.Workspace)/cutensor/
-    inputs:
+    - task: Bash@3
-      targetType: inline
+      displayName: 'List downloaded CUDA files'
-      script: cp $(Pipeline.Workspace)/llvm-project/llvm/build/bin/FileCheck $(Pipeline.Workspace)/llvm/bin
+      inputs:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        targetType: inline
-    parameters:
+        script: ls -la1R /usr/local/cuda-12.9
-      componentName: HIPIFY
+    # script: cp $(Pipeline.Workspace)/llvm-project/llvm/build/bin/FileCheck $(Pipeline.Workspace)/llvm/bin
-      extraBuildFlags: >-
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-        -DHIPIFY_CLANG_TESTS=ON
+      parameters:
-        -DCMAKE_BUILD_TYPE=Release
+        componentName: ${{ parameters.componentName }}
-        -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/targets/x86_64-linux
+        os: ${{ job.os }}
-        -DCUDA_DNN_ROOT_DIR=/usr/local/cuda/targets/x86_64-linux
+        consolidateBuildAndInstall: true
-        -DCMAKE_PREFIX_PATH=$(Pipeline.Workspace)/llvm;/usr/local/cuda/targets/x86_64-linux/lib
+        extraBuildFlags: >-
-        -DLLVM_EXTERNAL_LIT=$(Pipeline.Workspace)/llvm-project/llvm/build/bin/llvm-lit
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;/usr/local/cuda/targets/x86_64-linux/lib
-      multithreadFlag: -- -j32
+          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+          -DHIPIFY_CLANG_TESTS=ON
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+          -DCMAKE_BUILD_TYPE=Release
-    parameters:
+          -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.9
-      componentName: HIPIFY
+          -DCUDA_DNN_ROOT_DIR=/usr/local/cuda-12.9
-      testDir: $(Build.SourcesDirectory)/build
+          -DCUDA_CUB_ROOT_DIR=/usr/local/cuda-12.9/targets/x86_64-linux/include/cub
-      testExecutable: make
+          -DCUDA_TENSOR_ROOT_DIR=$(Pipeline.Workspace)/cutensor/
-      testParameters: test-hipify
+        multithreadFlag: -- -j32
-      testPublishResults: false
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        os: ${{ job.os }}
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-      aptPackages: ${{ parameters.aptPackages }}
+      parameters:
-      environment: combined
+        os: ${{ job.os }}
-      registerCUDAPackages: true
+    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      extraCopyDirectories:
+    #  parameters:
-        - llvm-project
+    #    componentName: HIPIFY
-      extraEnvVars:
+    #    testDir: $(Build.SourcesDirectory)/build
-        - UPSTREAM_LLVM_GIT_URL:::https://github.com/llvm/llvm-project.git
+    #    testExecutable: make
-        - UPSTREAM_LLVM_TAG:::llvmorg-18.1.2
+    #    testParameters: -j 32 test-hipify
    #    testPublishResults: false
    #    os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - ${{ if eq(job.os, 'ubuntu2204') }}:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          environment: combined
          registerCUDAPackages: true
          extraCopyDirectories:
            - llvm-project
--- a/.azuredevops/components/MIOpen.yml
+++ b/.azuredevops/components/MIOpen.yml
@@ -16,6 +16,7 @@ parameters:
    - cmake
    - jq
    - libdrm-dev
    - libmsgpack-dev
    - libsqlite3-dev
    - libstdc++-12-dev
    - ninja-build
--- a/.azuredevops/components/MIVisionX.yml
+++ b/.azuredevops/components/MIVisionX.yml
@@ -43,18 +43,20 @@ parameters:
 - name: rocmDependencies
  type: object
  default:
-    - rocm-cmake
+    - AMDMIGraphX
    - llvm-project
    - ROCR-Runtime
    - clr
    - half
    - hipBLAS-common
    - hipBLASLt
    - llvm-project
    - MIOpen
    - rocBLAS
    - rocDecode
    - rocm-cmake
    - rocminfo
    - rocprofiler-register
-    - half
+    - ROCR-Runtime
    - rocBLAS
    - MIOpen
    - AMDMIGraphX
    - rpp
    - rocDecode
 - name: rocmTestDependencies
  type: object
  default:
@@ -90,8 +92,7 @@ jobs:
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
-    pool:
+    pool: ${{ variables.MEDIUM_BUILD_POOL }}
      vmImage: ${{ variables.BASE_BUILD_POOL }}
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -20,7 +20,6 @@ parameters:
    - libnuma-dev
    - ninja-build
    - pkg-config
    - python3-pip
 - name: rocmDependencies
  type: object
  default:
@@ -36,51 +35,65 @@ parameters:
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - gfx90a:
        target: gfx90a
 jobs:
- job: ROCR_Runtime_build
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: ROCR_Runtime_build_${{ job.os }}
-  - group: common
+    pool:
-  - template: /.azuredevops/variables-global.yml
+      vmImage: 'ubuntu-22.04'
-  pool:
+    ${{ if eq(job.os, 'almalinux8') }}:
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+      container:
-  workspace:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    clean: all
+        endpoint: ContainerService3
-  steps:
+    variables:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+    - group: common
-    parameters:
+    - template: /.azuredevops/variables-global.yml
-      aptPackages: ${{ parameters.aptPackages }}
+    workspace:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      clean: all
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    steps:
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        aptPackages: ${{ parameters.aptPackages }}
-    parameters:
+        packageManager: ${{ job.packageManager }}
-      checkoutRef: ${{ parameters.checkoutRef }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      dependencyList: ${{ parameters.rocmDependencies }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      aggregatePipeline: ${{ parameters.aggregatePipeline }}
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      extraBuildFlags: >-
+      parameters:
-        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+        checkoutRef: ${{ parameters.checkoutRef }}
-        -DBUILD_SHARED_LIBS=ON
+        dependencyList: ${{ parameters.rocmDependencies }}
-        -DCMAKE_BUILD_TYPE=Release
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        -GNinja
+        os: ${{ job.os }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+        os: ${{ job.os }}
-  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        useAmdclang: false
-  #   parameters:
+        extraBuildFlags: >-
-  #     aptPackages: ${{ parameters.aptPackages }}
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DBUILD_SHARED_LIBS=ON
          -DCMAKE_BUILD_TYPE=Release
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
 - ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: ROCR_Runtime_test_${{ job.target }}
+  - job: ROCR_Runtime_test_${{ job.os }}_${{ job.target }}
-    dependsOn: ROCR_Runtime_build
+    dependsOn: ROCR_Runtime_build_${{ job.os }}
    condition:
      and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -97,6 +110,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - task: Bash@3
      displayName: Install libhwloc5
      inputs:
@@ -107,12 +121,15 @@ jobs:
          sudo apt install -y --allow-downgrades ./libhwloc5_1.11.12-3_amd64.deb ./libhwloc-dev_1.11.12-3_amd64.deb
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmTestDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -121,11 +138,13 @@ jobs:
        runRocminfo: false
    - task: Bash@3
      displayName: Build kfdtest
      continueOnError: true
      inputs:
        targetType: 'inline'
        workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest
        script: |
          if [ -e /opt/rh/gcc-toolset-14/enable ]; then
            source /opt/rh/gcc-toolset-14/enable
          fi
          mkdir build && cd build
          cmake -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm ..
          make
@@ -135,13 +154,16 @@ jobs:
        testExecutable: BIN_DIR=$(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/build ./run_kfdtest.sh
        testParameters: '-p core --gtest_output=xml:./test_output.xml --gtest_color=yes'
        testDir: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/scripts
        os: ${{ job.os }}
    - task: Bash@3
      displayName: Build rocrtst
      continueOnError: true
      inputs:
        targetType: 'inline'
        workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common
        script: |
          if [ -e /opt/rh/gcc-toolset-14/enable ]; then
            source /opt/rh/gcc-toolset-14/enable
          fi
          BASE_CLANG_DIR=$(Agent.BuildDirectory)/rocm/llvm/lib/clang
          export NEWEST_CLANG_VER=$(ls -1 $BASE_CLANG_DIR | sort -V | tail -n 1)
          mkdir build && cd build
@@ -159,6 +181,7 @@ jobs:
        testExecutable: ./rocrtst64
        testParameters: '--gtest_filter="-rocrtstNeg.Memory_Negative_Tests:rocrtstFunc.Memory_Max_Mem" --gtest_output=xml:./test_output.xml --gtest_color=yes'
        testDir: $(Build.SourcesDirectory)/rocrtst/suites/test_common/build/${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/ROCdbgapi.yml
+++ b/.azuredevops/components/ROCdbgapi.yml
@@ -15,7 +15,6 @@ parameters:
  default:
    - cmake
    - ninja-build
    - python3-pip
 - name: rocmDependencies
  type: object
  default:
@@ -24,37 +23,57 @@ parameters:
    - rocminfo
    - ROCR-Runtime
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
 jobs:
- job: ROCdbgapi
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: ROCdbgapi_build_${{ job.os }}
-  - group: common
+    pool:
-  - template: /.azuredevops/variables-global.yml
+      vmImage: 'ubuntu-22.04'
-  pool:
+    ${{ if eq(job.os, 'almalinux8') }}:
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+      container:
-  workspace:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    clean: all
+        endpoint: ContainerService3
-  steps:
+    variables:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+    - group: common
-    parameters:
+    - template: /.azuredevops/variables-global.yml
-      aptPackages: ${{ parameters.aptPackages }}
+    workspace:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      clean: all
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    steps:
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        aptPackages: ${{ parameters.aptPackages }}
-    parameters:
+        packageManager: ${{ job.packageManager }}
-      checkoutRef: ${{ parameters.checkoutRef }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      dependencyList: ${{ parameters.rocmDependencies }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      aggregatePipeline: ${{ parameters.aggregatePipeline }}
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      extraBuildFlags: >-
+      parameters:
-        -DCMAKE_BUILD_TYPE=Release
+        checkoutRef: ${{ parameters.checkoutRef }}
-        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+        dependencyList: ${{ parameters.rocmDependencies }}
-        -GNinja
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+        os: ${{ job.os }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+      parameters:
-  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        os: ${{ job.os }}
-  #   parameters:
+        useAmdclang: false
-  #     aptPackages: ${{ parameters.aptPackages }}
+        extraBuildFlags: >-
          -DCMAKE_BUILD_TYPE=Release
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/ROCgdb.yml
+++ b/.azuredevops/components/ROCgdb.yml
@@ -1,4 +1,7 @@
 parameters:
 - name: componentName
  type: string
  default: ROCgdb
 - name: checkoutRepo
  type: string
  default: 'self'
@@ -23,8 +26,10 @@ parameters:
    - libgmp-dev
    - liblzma-dev
    - libmpfr-dev
    - pkg-config
    - ncurses-dev
    - pkg-config
    - python3-dev
    - python3-pip
    - texinfo
    - zlib1g-dev
 - name: rocmDependencies
@@ -40,67 +45,87 @@ parameters:
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - gfx90a:
        target: gfx90a
 jobs:
- job: ROCgdb
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}
-  - group: common
+    variables:
-  - template: /.azuredevops/variables-global.yml
+    - group: common
-  - name: PKG_CONFIG_PATH
+    - template: /.azuredevops/variables-global.yml
-    value: $(Agent.BuildDirectory)/rocm/share/pkgconfig
+    - name: PKG_CONFIG_PATH
-  pool:
+      value: $(Agent.BuildDirectory)/rocm/share/pkgconfig
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+    pool:
-  workspace:
+      ${{ if eq(job.os, 'ubuntu2404') }}:
-    clean: all
+        name: rocm-ci_medium_build_pool_2404
-  steps:
+      ${{ else }}:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        name: ${{ variables.MEDIUM_BUILD_POOL }}
-    parameters:
+    ${{ if eq(job.os, 'almalinux8') }}:
-      aptPackages: ${{ parameters.aptPackages }}
+      container:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        endpoint: ContainerService3
-    parameters:
+    workspace:
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+      clean: all
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+    steps:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-    parameters:
+      parameters:
-      checkoutRef: ${{ parameters.checkoutRef }}
+        aptPackages: ${{ parameters.aptPackages }}
-      dependencyList: ${{ parameters.rocmDependencies }}
+        packageManager: ${{ job.packageManager }}
-      aggregatePipeline: ${{ parameters.aggregatePipeline }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-autotools.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-    parameters:
+      parameters:
-      configureFlags: >-
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-        --program-prefix=roc
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-        --enable-64-bit-bfd
+      parameters:
-        --enable-targets="x86_64-linux-gnu,amdgcn-amd-amdhsa"
+        os: ${{ job.os }}
-        --disable-ld
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        --disable-gas
+      parameters:
-        --disable-gdbserver
+        checkoutRef: ${{ parameters.checkoutRef }}
-        --disable-sim
+        dependencyList: ${{ parameters.rocmDependencies }}
-        --enable-tui
+        os: ${{ job.os }}
-        --disable-gdbtk
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        --disable-shared
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-autotools.yml
-        --disable-gprofng
+      parameters:
-        --with-expat
+        os: ${{ job.os }}
-        --with-system-zlib
+        configureFlags: >-
-        --without-guile
+          --program-prefix=roc
-        --with-babeltrace
+          --enable-64-bit-bfd
-        --with-lzma
+          --enable-targets="x86_64-linux-gnu,amdgcn-amd-amdhsa"
-        --with-python=python3
+          --disable-ld
-        --with-rocm-dbgapi=$(Agent.BuildDirectory)/rocm
+          --disable-gas
-        LDFLAGS="-Wl,--enable-new-dtags,-rpath=$(Agent.BuildDirectory)/rocm/lib"
+          --disable-gdbserver
-      makeCallPrefix: LD_RUN_PATH='${ORIGIN}/../lib'
+          --disable-sim
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+          --enable-tui
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+          --disable-gdbtk
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+          --disable-shared
          --disable-gprofng
          --with-expat
          --with-system-zlib
          --without-guile
          --with-babeltrace
          --with-lzma
          --with-python=python3
          --with-rocm-dbgapi=$(Agent.BuildDirectory)/rocm
          LDFLAGS="-Wl,--enable-new-dtags,-rpath=$(Agent.BuildDirectory)/rocm/lib"
        makeCallPrefix: LD_RUN_PATH='${ORIGIN}/../lib'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
 - ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: ROCgdb_test_${{ job.target }}
+  - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    dependsOn: ROCgdb
+    dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
    condition:
      and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -119,18 +144,23 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-autotools.yml
      parameters:
        os: ${{ job.os }}
        configureFlags: >-
          --program-prefix=roc
          --enable-64-bit-bfd
@@ -166,7 +196,9 @@ jobs:
      continueOnError: true
      inputs:
        targetType: inline
-        script: make check-gdb TESTS=gdb.rocm/simple.exp
+        script: |
          ${{ iif(eq(job.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
          make check-gdb TESTS=gdb.rocm/simple.exp
        workingDirectory: $(Build.SourcesDirectory)
    - task: Bash@3
      displayName: print gdb log
--- a/.azuredevops/components/Tensile.yml
+++ b/.azuredevops/components/Tensile.yml
@@ -1,10 +1,29 @@
 parameters:
 - name: componentName
  type: string
  default: Tensile
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
 # monorepo related parameters
 - name: sparseCheckoutDir
  type: string
  default: ''
 - name: triggerDownstreamJobs
  type: boolean
  default: false
 - name: downstreamAggregateNames
  type: string
  default: ''
 - name: buildDependsOn
  type: object
  default: null
 - name: unifiedBuild
  type: boolean
  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -13,7 +32,6 @@ parameters:
 - name: aptPackages
  type: object
  default:
    - python3-pip
    - cmake
    - libmsgpack-dev
    - libboost-program-options-dev
@@ -38,75 +56,97 @@ parameters:
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - gfx90a:
        target: gfx90a
 jobs:
- job: Tensile_build
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}
-  - group: common
+    ${{ if parameters.buildDependsOn }}:
-  - template: /.azuredevops/variables-global.yml
+      dependsOn: ${{ parameters.buildDependsOn[job.target] }}
-  - name: ROCM_PATH
+    variables:
-    value: $(Agent.BuildDirectory)/rocm
+    - group: common
-  pool:
+    - template: /.azuredevops/variables-global.yml
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+    - name: ROCM_PATH
-  workspace:
+      value: $(Agent.BuildDirectory)/rocm
-    clean: all
+    pool:
-  steps:
+      vmImage: ${{ variables.BASE_BUILD_POOL }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+    ${{ if eq(job.os, 'almalinux8') }}:
-    parameters:
+      container:
-      aptPackages: ${{ parameters.aptPackages }}
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-      pipModules: ${{ parameters.pipModules }}
+        endpoint: ContainerService3
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    workspace:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+      clean: all
-    parameters:
+    steps:
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
-    parameters:
+        aptPackages: ${{ parameters.aptPackages }}
-      checkoutRef: ${{ parameters.checkoutRef }}
+        pipModules: ${{ parameters.pipModules }}
-      dependencyList: ${{ parameters.rocmDependencies }}
+        packageManager: ${{ job.packageManager }}
-      aggregatePipeline: ${{ parameters.aggregatePipeline }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-  - task: Bash@3
+      parameters:
-    displayName: Create wheel file
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-    inputs:
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-      targetType: inline
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      script: python3 setup.py bdist_wheel
+      parameters:
-      workingDirectory: $(Build.SourcesDirectory)
+        checkoutRef: ${{ parameters.checkoutRef }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
+        dependencyList: ${{ parameters.rocmDependencies }}
-    parameters:
+        os: ${{ job.os }}
-      sourceDir: $(Build.SourcesDirectory)/dist
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-      contentsString: '*.whl'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
-      targetDir: $(Build.ArtifactStagingDirectory)
+      parameters:
-      clean: false
+        os: ${{ job.os }}
-  - task: PublishPipelineArtifact@1
+    - task: Bash@3
-    displayName: 'wheel file Publish'
+      displayName: Create wheel file
-    retryCountOnTaskFailure: 3
+      inputs:
-    inputs:
+        targetType: inline
-      targetPath: $(Build.ArtifactStagingDirectory)
+        script: python3 setup.py bdist_wheel
-  - task: Bash@3
+        workingDirectory: $(Agent.BuildDirectory)/s
-    displayName: Save pipeline artifact file names
+    - task: Bash@3
-    inputs:
+      displayName: Rename wheel file with job OS
-      workingDirectory: $(Pipeline.Workspace)
+      inputs:
-      targetType: inline
+        targetType: inline
-      script: |
+        workingDirectory: $(Agent.BuildDirectory)/s
-        whlFile=$(find "$(Build.ArtifactStagingDirectory)" -type f -name "*.whl" | head -n 1)
+        script: |
-        if [ -n "$whlFile" ]; then
+          wheelFile=$(find "$(Agent.BuildDirectory)/s/dist" -type f -name "*.whl" | head -n 1)
-          echo $(basename "$whlFile") >> pipelineArtifacts.txt
+          newWheelFile="$(basename "$wheelFile" .whl)-${{ job.os }}.whl"
-        fi
+          mv "$wheelFile" "$(dirname "$wheelFile")/$newWheelFile"
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
-  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
-  #   parameters:
+        sourceDir: $(Agent.BuildDirectory)/s/dist
-  #     aptPackages: ${{ parameters.aptPackages }}
+        contentsString: '*.whl'
-  #     pipModules: ${{ parameters.pipModules }}
+        targetDir: $(Build.ArtifactStagingDirectory)
        clean: false
    - task: PublishPipelineArtifact@1
      displayName: 'wheel file Publish'
      retryCountOnTaskFailure: 3
      inputs:
        targetPath: $(Build.ArtifactStagingDirectory)
    - task: Bash@3
      displayName: Save pipeline artifact file names
      inputs:
        workingDirectory: $(Pipeline.Workspace)
        targetType: inline
        script: |
          whlFile=$(find "$(Build.ArtifactStagingDirectory)" -type f -name "*.whl" | head -n 1)
          if [ -n "$whlFile" ]; then
            echo $(basename "$whlFile") >> pipelineArtifacts.txt
          fi
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
    #     pipModules: ${{ parameters.pipModules }}
 - ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: Tensile_test_${{ job.target }}
+  - job: Tensile_test_${{ job.os }}_${{ job.target }}
    timeoutInMinutes: 180
-    dependsOn: Tensile_build
+    dependsOn: Tensile_build_${{ job.os }}
    condition:
      and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -126,20 +166,23 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - task: DownloadPipelineArtifact@2
      displayName: 'Download Pipeline Wheel Files'
      inputs:
-        itemPattern: '**/*.whl'
+        itemPattern: '**/*${{ job.os }}*.whl'
        targetPath: $(Agent.BuildDirectory)
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - task: Bash@3
      displayName: pip install
@@ -164,7 +207,7 @@ jobs:
      inputs:
        targetType: inline
        script: tox run -v -e ci -- -m pre_checkin
-        workingDirectory: $(Build.SourcesDirectory)
+        workingDirectory: $(Agent.BuildDirectory)/s
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/amdsmi.yml
+++ b/.azuredevops/components/amdsmi.yml
@@ -16,50 +16,66 @@ parameters:
    - cmake
    - libdrm-dev
    - ninja-build
    - python3-pip
    - pkg-config
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: ubuntu2404, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - gfx90a:
        target: gfx90a
 jobs:
- job: amdsmi_build
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: amdsmi_build_${{ job.os }}
-  - group: common
+    pool:
-  - template: /.azuredevops/variables-global.yml
+      ${{ if eq(job.os, 'ubuntu2404') }}:
-  pool:
+        vmImage: 'ubuntu-24.04'
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+      ${{ else }}:
-  workspace:
+        vmImage: 'ubuntu-22.04'
-    clean: all
+    ${{ if eq(job.os, 'almalinux8') }}:
-  steps:
+      container:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    parameters:
+        endpoint: ContainerService3
-      aptPackages: ${{ parameters.aptPackages }}
+    variables:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - group: common
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    - template: /.azuredevops/variables-global.yml
-    parameters:
+    workspace:
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+      clean: all
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+    steps:
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      extraBuildFlags: >-
+      parameters:
-        -DBUILD_TESTS=ON
+        aptPackages: ${{ parameters.aptPackages }}
-        -GNinja
+        packageManager: ${{ job.packageManager }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+      parameters:
-  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-  #   parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-  #     aptPackages: ${{ parameters.aptPackages }}
+      parameters:
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
          -DBUILD_TESTS=ON
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
 - ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: amdsmi_test_${{ job.target }}
+  - job: amdsmi_test_${{ job.os }}_${{ job.target }}
-    dependsOn: amdsmi_build
+    dependsOn: amdsmi_build_${{ job.os }}
    condition:
      and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -76,8 +92,11 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
      parameters:
        runRocminfo: false
@@ -85,8 +104,9 @@ jobs:
      parameters:
        componentName: amdsmi
        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: './rocm/share/amd_smi/tests/amdsmitst'
+        testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/aomp.yml
+++ b/.azuredevops/components/aomp.yml
@@ -1,4 +1,7 @@
 parameters:
 - name: componentName
  type: string
  default: aomp
 - name: checkoutRepo
  type: string
  default: 'self'
@@ -15,173 +18,187 @@ parameters:
 - name: aptPackages
  type: object
  default:
    - bison
    - ccache
    - cmake
-    - python3-pip
+    - flex
    - ninja-build
    - pkg-config
    - libpci-dev
    - libnuma-dev
    - libffi-dev
    - git
    - libopenmpi-dev
    - gawk
    - git
    - mesa-common-dev
-    - libtool
+    - ninja-build
    - libbabeltrace-dev
    - libbison-dev
    - libdrm-amdgpu1
    - libdrm-dev
    - libdw-dev
-    - libgtest-dev
+    - libffi-dev
-    - libsystemd-dev
+    - libgmp-dev
    - liblzma-dev
    - libmpfr-dev
    - libncurses5-dev
    - libnuma-dev
    - libopenmpi-dev
    - libpci-dev
    - libssl-dev
    - libstdc++-12-dev
-    - ccache
+    - libsystemd-dev
-    - libgmp-dev
+    - libtool
    - libmpfr-dev
    - texinfo
    - libbison-dev
    - bison
    - flex
    - libbabeltrace-dev
    - libncurses5-dev
    - liblzma-dev
    - python3-setuptools
    - python3-dev
    - libudev-dev
    - parallel
-  # Referencing comment snippet.
+    - pkg-config
-  #
+    - python3-dev
-  # snippet from https://github.com/ROCm/aomp/blob/aomp-dev/bin/build_aomp.sh#L131-L134
+    - python3-pip
-  #
+    - python3-setuptools
-  # For ROCM build (AOMP_STANDALONE_BUILD=0) the components roct, rocr,
+    - texinfo
  # libdevice, project, comgr, rocminfo, hipamd, rocdbgapi, rocgdb,
  # roctracer, rocprofiler, rocm_smi_lib, and amdsmi should be found
  # in ROCM in /opt/rocm.  The ROCM build only needs these components:
 - name: rocmDependencies
  type: object
  default:
-    - amdsmi
+    - llvm-project
    - ROCR-Runtime
 - name: rocmTestDependencies
  type: object
  default:
    - clr
    - llvm-project
    - ROCdbgapi
    - ROCgdb
    - rocm-cmake
    - rocm-core
    - rocminfo
    - rocm_smi_lib
    - rocprofiler
    - rocprofiler-register
    - rocprofiler-sdk
    - ROCR-Runtime
-    - roctracer
+    - rocprofiler-register
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - gfx90a:
        target: gfx90a
 jobs:
- job: aomp
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}
-  - group: common
+    variables:
-  - template: /.azuredevops/variables-global.yml
+    - group: common
-  pool: ${{ variables.MEDIUM_BUILD_POOL }}
+    - template: /.azuredevops/variables-global.yml
-  workspace:
+    pool:
-    clean: all
+      ${{ if eq(job.os, 'ubuntu2404') }}:
-  steps:
+        name: rocm-ci_medium_build_pool_2404
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      ${{ else }}:
-    parameters:
+        name: ${{ variables.MEDIUM_BUILD_POOL }}
-      aptPackages: ${{ parameters.aptPackages }}
+    ${{ if eq(job.os, 'almalinux8') }}:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      container:
-# checkout the repos tied to openmp-extras, plus llvm-project
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        endpoint: ContainerService3
-    parameters:
+    workspace:
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+      clean: all
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    steps:
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      checkoutRepo: aomp-extras_repo
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        aptPackages: ${{ parameters.aptPackages }}
-    parameters:
+        packageManager: ${{ job.packageManager }}
-      checkoutRepo: flang_repo
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    # checkout the repos tied to openmp-extras, plus llvm-project
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      checkoutRepo: llvm-project_repo
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      checkoutRef: ${{ parameters.checkoutRef }}
+      parameters:
-      dependencyList: ${{ parameters.rocmDependencies }}
+        checkoutRepo: aomp-extras_repo
-      aggregatePipeline: ${{ parameters.aggregatePipeline }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
-    parameters:
+        checkoutRepo: flang_repo
-      componentName: extras
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      cmakeBuildDir: '$(Build.SourcesDirectory)/aomp-extras/build'
+      parameters:
-      cmakeSourceDir: '$(Build.SourcesDirectory)/aomp-extras'
+        checkoutRepo: llvm-project_repo
-      installDir: '$(Build.BinariesDirectory)/llvm'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
-      extraBuildFlags: >-
+      parameters:
-        -DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm
+        dependencyList:
-        -DCMAKE_BUILD_TYPE=Release
+          - gtest
-        -DAOMP_STANDALONE_BUILD=0
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        -DAOMP_VERSION_STRING=9.99.99
+      parameters:
-        -GNinja
+        checkoutRef: ${{ parameters.checkoutRef }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        dependencyList: ${{ parameters.rocmDependencies }}
-    parameters:
+        os: ${{ job.os }}
-      componentName: openmp
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-      cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/openmp/build'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/openmp'
+      parameters:
-      installDir: '$(Build.BinariesDirectory)/llvm'
+        os: ${{ job.os }}
-      extraBuildFlags: >-
+        useAmdclang: false
-        -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
+        componentName: extras
-        -DCMAKE_BUILD_TYPE=Release
+        cmakeBuildDir: '$(Build.SourcesDirectory)/aomp-extras/build'
-        -DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
+        cmakeSourceDir: '$(Build.SourcesDirectory)/aomp-extras'
-        -DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
+        installDir: '$(Build.BinariesDirectory)/llvm'
-        -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
+        extraBuildFlags: >-
-        -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
+          -DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm
-        -DOPENMP_ENABLE_LIBOMPTARGET=1
+          -DCMAKE_BUILD_TYPE=Release
-        -DLIBOMP_COPY_EXPORTS=OFF
+          -DAOMP_STANDALONE_BUILD=0
-        -DLIBOMP_OMPT_SUPPORT=ON
+          -DAOMP_VERSION_STRING=9.99.99
-        -DLIBOMP_OMPD_SUPPORT=ON
+          -GNinja
-        -DCMAKE_SKIP_INSTALL_RPATH=TRUE
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-        -DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
+      parameters:
-        -DLIBOMP_FORTRAN_MODULES_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/flang
+        os: ${{ job.os }}
-        -DLIBOMP_MODULES_INSTALL_PATH=$(Build.BinariesDirectory)/llvm/include/flang/
+        componentName: openmp
-        -GNinja
+        cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/openmp/build'
-  - task: Bash@3
+        cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/openmp'
-    displayName: 'ROCm symbolic link'
+        installDir: '$(Build.BinariesDirectory)/llvm'
-    inputs:
+        extraBuildFlags: >-
-      targetType: inline
+          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
-      script: sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
+          -DCMAKE_BUILD_TYPE=Release
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+          -DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-    parameters:
+          -DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-      componentName: offload
+          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
-      cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/offload/build'
+          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
-      cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/offload'
+          -DOPENMP_ENABLE_LIBOMPTARGET=1
-      installDir: '$(Build.BinariesDirectory)/llvm'
+          -DLIBOMP_COPY_EXPORTS=OFF
-      extraBuildFlags: >-
+          -DLIBOMP_OMPD_SUPPORT=ON
-        -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
+          -DCMAKE_SKIP_INSTALL_RPATH=TRUE
-        -DCMAKE_BUILD_TYPE=Release
+          -DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
-        -DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
+          -DLIBOMP_FORTRAN_MODULES_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/flang
-        -DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
+          -DLIBOMP_MODULES_INSTALL_PATH=$(Build.BinariesDirectory)/llvm/include/flang/
-        -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
+        multithreadFlag: -- -j32
-        -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
+    - task: Bash@3
-        -DCMAKE_SKIP_INSTALL_RPATH=TRUE
+      displayName: 'ROCm symbolic link'
-        -DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
+      inputs:
-        -DLIBOMPTARGET_LLVM_INCLUDE_DIRS=$(Build.SourcesDirectory)/llvm-project/llvm/include
+        targetType: inline
-        -DCMAKE_EXE_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
+        script: sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
-        -DCMAKE_SHARED_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-        -GNinja
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+        os: ${{ job.os }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+        componentName: offload
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+        cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/offload/build'
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/offload'
-    parameters:
+        installDir: '$(Build.BinariesDirectory)/llvm'
-      aptPackages: ${{ parameters.aptPackages }}
+        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
          -DCMAKE_BUILD_TYPE=Release
          -DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
          -DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
          -DCMAKE_SKIP_INSTALL_RPATH=TRUE
          -DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
          -DLIBOMPTARGET_LLVM_INCLUDE_DIRS=$(Build.SourcesDirectory)/llvm-project/llvm/include
          -DCMAKE_EXE_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
          -DCMAKE_SHARED_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - ${{ if eq(job.os, 'ubuntu2204') }}:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
 - ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: aomp_test_${{ job.target }}
+  - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    dependsOn: aomp
+    dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
    condition:
      and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -198,12 +215,16 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependencies }}
+        dependencyList: ${{ parameters.rocmTestDependencies }}
        os: ${{ job.os }}
    - task: Bash@3
      displayName: ROCm symbolic link
      inputs:
@@ -215,7 +236,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: aomp-extras_repo
-  # these copy steps are from the aomp prototype script for test prep
+    # these copy steps are from the aomp prototype script for test prep
    - task: CopyFiles@2
      displayName: 'Copy AOMP contents'
      inputs:
--- a/.azuredevops/components/copyHIP.yml
+++ b/.azuredevops/components/copyHIP.yml
@@ -1,36 +1,42 @@
 parameters:
- name: checkoutRepo
+- name: jobMatrix
-  type: string
+  type: object
-  default: 'self'
+  default:
- name: checkoutRef
+    copyJobs:
-  type: string
+      - { os: ubuntu2204, backend: amd }
-  default: ''
+      - { os: almalinux8, backend: amd }
      - { os: ubuntu2204, backend: nvidia }
      - { os: almalinux8, backend: nvidia }
 # hip and clr are tightly-coupled
 # run this same template for both repos
 # any changes for clr should just trigger HIP pipeline
 jobs:
- job: hip_clr_combined
+- ${{ each job in parameters.jobMatrix.copyJobs }}:
-  variables:
+  - job: hip_clr_combined_${{ job.os }}_${{ job.backend }}
-  - group: common
+    variables:
-  - template: /.azuredevops/variables-global.yml
+    - group: common
-  pool:
+    - template: /.azuredevops/variables-global.yml
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+    pool:
-  workspace:
+      vmImage: ${{ variables.BASE_BUILD_POOL }}
-    clean: all
+    workspace:
-  steps:
+      clean: all
-# checkout nothing, just copy artifacts from triggering HIP job
+    steps:
-# and then publish for this clr job or for this hipother job to maintain latest
+  # checkout nothing, just copy artifacts from triggering HIP job
-  - checkout: none
+  # and then publish for this clr job or for this hipother job to maintain latest
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-download.yml
+    - checkout: none
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-download.yml
-      componentName: HIP
+      parameters:
-      pipelineId: $(HIP_PIPELINE_ID)
+        componentName: HIP
-  - task: Bash@3
+        pipelineId: $(HIP_PIPELINE_ID)
-    displayName: Copy HIP artifacts
+        fileFilter: ${{ job.os }}*${{ job.backend }}
-    inputs:
+    - task: Bash@3
-      targetType: inline
+      displayName: Copy HIP artifacts
-      script: cp -a $(Agent.BuildDirectory)/rocm/* $(Build.BinariesDirectory)/
+      inputs:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+        targetType: inline
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+        script: cp -a $(Agent.BuildDirectory)/rocm/* $(Build.BinariesDirectory)/
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
--- a/.azuredevops/components/hipBLAS-common.yml
+++ b/.azuredevops/components/hipBLAS-common.yml
@@ -1,10 +1,29 @@
 parameters:
 - name: componentName
  type: string
  default: hipBLAS-common
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
 # monorepo related parameters
 - name: sparseCheckoutDir
  type: string
  default: ''
 - name: triggerDownstreamJobs
  type: boolean
  default: false
 - name: downstreamAggregateNames
  type: string
  default: ''
 - name: buildDependsOn
  type: object
  default: null
 - name: unifiedBuild
  type: boolean
  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -14,54 +33,103 @@ parameters:
  type: object
  default:
    - cmake
    - ninja-build
    - git
    - ninja-build
    - wget
    - python3-pip
 - name: rocmDependencies
  type: object
  default:
    - rocm-cmake
    - llvm-project
    - ROCR-Runtime
    - clr
    - llvm-project
    - rocm-cmake
    - rocminfo
    - ROCR-Runtime
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
 # - name: downstreamComponentMatrix
 #   type: object
 #   default:
 #     - hipBLASLt:
 #       name: hipBLASLt
 #       sparseCheckoutDir: projects/hipblaslt
 #       skipUnifiedBuild: 'false'
 #       buildDependsOn:
 #         - hipBLAS_common_build
 jobs:
- job: hipBLAS_common
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: hipBLAS_common_build_${{ job.os }}
-  - group: common
+    ${{ if parameters.buildDependsOn }}:
-  - name: ROCM_PATH
+      dependsOn:
-    value: $(Agent.BuildDirectory)/rocm
+        - ${{ each build in parameters.buildDependsOn }}:
-  - template: /.azuredevops/variables-global.yml
+          - ${{ build }}_${{ job.os }}
-  pool:
+    variables:
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+    - group: common
-  workspace:
+    - template: /.azuredevops/variables-global.yml
-    clean: all
+    - name: ROCM_PATH
-  steps:
+      value: $(Agent.BuildDirectory)/rocm
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+    pool:
-    parameters:
+      vmImage: ${{ variables.BASE_BUILD_POOL }}
-      aptPackages: ${{ parameters.aptPackages }}
+    ${{ if eq(job.os, 'almalinux8') }}:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      container:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    parameters:
+        endpoint: ContainerService3
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+    workspace:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      clean: all
-    parameters:
+    steps:
-      checkoutRef: ${{ parameters.checkoutRef }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      dependencyList: ${{ parameters.rocmDependencies }}
+      parameters:
-      aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        aptPackages: ${{ parameters.aptPackages }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        packageManager: ${{ job.packageManager }}
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      extraBuildFlags: >-
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+      parameters:
-        -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-        -GNinja
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+        checkoutRef: ${{ parameters.checkoutRef }}
-  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        dependencyList: ${{ parameters.rocmDependencies }}
-  #   parameters:
+        os: ${{ job.os }}
-  #     aptPackages: ${{ parameters.aptPackages }}
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-  #     extraEnvVars:
+        ${{ if parameters.triggerDownstreamJobs }}:
-  #       - ROCM_PATH:::/home/user/workspace/rocm
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
        componentName: ${{ parameters.componentName }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
    #     extraEnvVars:
    #       - ROCM_PATH:::/home/user/workspace/rocm
 # - ${{ if parameters.triggerDownstreamJobs }}:
 #   - ${{ each component in parameters.downstreamComponentMatrix }}:
 #     - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
 #       - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
 #         parameters:
 #           checkoutRepo: ${{ parameters.checkoutRepo }}
 #           sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
 #           buildDependsOn: ${{ component.buildDependsOn }}
 #           downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
 #           triggerDownstreamJobs: true
 #           unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/hipBLASLt.yml
+++ b/.azuredevops/components/hipBLASLt.yml
@@ -1,10 +1,29 @@
 parameters:
 - name: componentName
  type: string
  default: hipBLASLt
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
 # monorepo related parameters
 - name: sparseCheckoutDir
  type: string
  default: ''
 - name: triggerDownstreamJobs
  type: boolean
  default: false
 - name: downstreamAggregateNames
  type: string
  default: ''
 - name: buildDependsOn
  type: object
  default: null
 - name: unifiedBuild
  type: boolean
  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -13,6 +32,8 @@ parameters:
 - name: aptPackages
  type: object
  default:
    - ccache
    - gfortran
    - git
    - libdrm-dev
    - libmsgpack-dev
@@ -20,9 +41,6 @@ parameters:
    - ninja-build
    - python3-pip
    - python3-venv
    - gfortran
    - libblas-dev
    - ccache
 - name: pipModules
  type: object
  default:
@@ -37,6 +55,7 @@ parameters:
    - hipBLAS-common
    - llvm-project
    - rocminfo
    - rocm-cmake
    - rocm_smi_lib
    - rocprofiler-register
    - ROCR-Runtime
@@ -58,20 +77,37 @@ parameters:
  type: object
  default:
    buildJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-        target: gfx90a
+      # - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
      # - { os: almalinux8, packageManager: dnf, target: gfx942 }
      # - { os: almalinux8, packageManager: dnf, target: gfx90a }
      # - { os: almalinux8, packageManager: dnf, target: gfx1201 }
      # - { os: almalinux8, packageManager: dnf, target: gfx1100 }
      # - { os: almalinux8, packageManager: dnf, target: gfx1030 }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+# - name: downstreamComponentMatrix
-        target: gfx90a
+#   type: object
 #   default:
 #     - rocBLAS:
 #       name: rocBLAS
 #       sparseCheckoutDir: projects/rocblas
 #       skipUnifiedBuild: 'false'
 #       buildDependsOn:
 #         - hipBLASLt_build
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: hipBLASLt_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    timeoutInMinutes: 300
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
          - ${{ build }}_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -86,6 +122,10 @@ jobs:
    - name: DAY_STRING
      value: $[format('{0:ddMMyyyy}', pipeline.startTime)]
    pool: ${{ variables.ULTRA_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
@@ -93,17 +133,22 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - task: Bash@3
      displayName: Add ROCm binaries to PATH
      inputs:
@@ -111,22 +156,20 @@ jobs:
        script: |
          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
-  # Build and install gtest, lapack, hipBLAS-common
+    # hipBLASLt has a script for gtest and lapack
-  # $(Pipeline.Workspace)/deps is a temporary folder for the build process
+    # https://github.com/ROCm/hipBLASLt/blob/develop/deps/CMakeLists.txt
-  # $(Pipeline.Workspace)/s/deps is part of the hipBLASLt repo
+    # $(Agent.BuildDirectory)/deps is a temporary folder for the build process
-    - script: mkdir $(Pipeline.Workspace)/deps
+    # $(Agent.BuildDirectory)/s/deps is part of the hipBLASLt repo
-      displayName: Create temp folder for external dependencies
+    - task: Bash@3
-  # hipBLASLt already has a CMake script for external deps, so we can just run that
+      displayName: Build and install external dependencies
-  # https://github.com/ROCm/hipBLASLt/blob/develop/deps/CMakeLists.txt
+      inputs:
-    - script: cmake $(Pipeline.Workspace)/s/deps
+        targetType: inline
-      displayName: Configure hipBLASLt external dependencies
+        script: |
-      workingDirectory: $(Pipeline.Workspace)/deps
+          mkdir -p $(Agent.BuildDirectory)/deps
-    - script: make
+          cd $(Agent.BuildDirectory)/deps
-      displayName: Build hipBLASLt external dependencies
+          cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
-      workingDirectory: $(Pipeline.Workspace)/deps
+          make
-    - script: sudo make install
+          sudo make install
      displayName: Install hipBLASLt external dependencies
      workingDirectory: $(Pipeline.Workspace)/deps
    - script: |
        mkdir -p $(CCACHE_DIR)
        echo "##vso[task.prependpath]/usr/lib/ccache"
@@ -134,93 +177,117 @@ jobs:
    - task: Cache@2
      displayName: Ccache caching
      inputs:
-        key: hipBLASLt | $(Agent.OS) | ${{ job.target }} | $(DAY_STRING) | $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+        key: hipBLASLt | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING) | $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
        path: $(CCACHE_DIR)
        restoreKeys: |
-          hipBLASLt | $(Agent.OS) | ${{ job.target }} | $(DAY_STRING)
+          hipBLASLt | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING)
-          hipBLASLt | $(Agent.OS) | ${{ job.target }}
+          hipBLASLt | ${{ job.os }} | ${{ job.target }}
-          hipBLASLt | $(Agent.OS)
+          hipBLASLt | ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        extraBuildFlags: >-
          -DCMAKE_BUILD_TYPE=Release
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
          -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
          -DCMAKE_C_COMPILER_LAUNCHER=ccache
          -DAMDGPU_TARGETS=${{ job.target }}
          -DTensile_LOGIC=
          -DTensile_CPU_THREADS=
          -DTensile_LIBRARY_FORMAT=msgpack
          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
          -DBUILD_CLIENTS_TESTS=ON
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+    - ${{ if eq(job.os, 'ubuntu2204') }}:
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        aptPackages: ${{ parameters.aptPackages }}
+        parameters:
-        pipModules: ${{ parameters.pipModules }}
+          aptPackages: ${{ parameters.aptPackages }}
-        gpuTarget: ${{ job.target }}
+          pipModules: ${{ parameters.pipModules }}
-        extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
+          gpuTarget: ${{ job.target }}
-        installLatestCMake: true
+          extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
-        extraEnvVars:
+          installLatestCMake: true
-          - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
+          extraEnvVars:
-          - TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/amdclang
+            - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
-          - TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
+            - TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/amdclang
-          - ROCM_PATH:::/home/user/workspace/rocm
+            - TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
-        extraCopyDirectories:
+            - ROCM_PATH:::/home/user/workspace/rocm
-          - deps
+          extraCopyDirectories:
            - deps
- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - job: hipBLASLt_test_${{ job.target }}
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    timeoutInMinutes: 300
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    dependsOn: hipBLASLt_build_${{ job.target }}
+      timeoutInMinutes: 300
-    condition:
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-      and(succeeded(),
+      condition:
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        and(succeeded(),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        eq(${{ parameters.aggregatePipeline }}, False)
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-      )
+          eq(${{ parameters.aggregatePipeline }}, False)
-    variables:
+        )
-    - group: common
+      variables:
-    - template: /.azuredevops/variables-global.yml
+      - group: common
-    - name: ROCM_PATH
+      - template: /.azuredevops/variables-global.yml
-      value: $(Agent.BuildDirectory)/rocm
+      - name: ROCM_PATH
-    pool: ${{ job.target }}_test_pool
+        value: $(Agent.BuildDirectory)/rocm
-    workspace:
+      pool: ${{ job.target }}_test_pool
-      clean: all
+      workspace:
-    steps:
+        clean: all
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      steps:
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        aptPackages: ${{ parameters.aptPackages }}
+        parameters:
-        pipModules: ${{ parameters.pipModules }}
+          aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+          pipModules: ${{ parameters.pipModules }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        gpuTarget: ${{ job.target }}
+        parameters:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+          preTargetFilter: ${{ parameters.componentName }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+          os: ${{ job.os }}
-      parameters:
+          gpuTarget: ${{ job.target }}
-        checkoutRef: ${{ parameters.checkoutRef }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-        dependencyList: ${{ parameters.rocmTestDependencies }}
+        parameters:
-        gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
-      parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
-        componentName: hipBLASLt
+          dependencyList: ${{ parameters.rocmTestDependencies }}
-        testDir: '$(Agent.BuildDirectory)/rocm/bin'
+          os: ${{ job.os }}
-        testExecutable: './hipblaslt-test'
+          gpuTarget: ${{ job.target }}
-        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
+          ${{ if parameters.triggerDownstreamJobs }}:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-        aptPackages: ${{ parameters.aptPackages }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        pipModules: ${{ parameters.pipModules }}
+        parameters:
-        environment: test
+          componentName: ${{ parameters.componentName }}
-        gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
          testDir: '$(Agent.BuildDirectory)/rocm/bin'
          testExecutable: './hipblaslt-test'
          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          pipModules: ${{ parameters.pipModules }}
          environment: test
          gpuTarget: ${{ job.target }}
 # - ${{ if parameters.triggerDownstreamJobs }}:
 #   - ${{ each component in parameters.downstreamComponentMatrix }}:
 #     - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
 #       - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
 #         parameters:
 #           checkoutRepo: ${{ parameters.checkoutRepo }}
 #           sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
 #           buildDependsOn: ${{ component.buildDependsOn }}
 #           downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
 #           triggerDownstreamJobs: true
 #           unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/hipCUB.yml
+++ b/.azuredevops/components/hipCUB.yml
@@ -1,10 +1,29 @@
 parameters:
 - name: componentName
  type: string
  default: hipCUB
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
 # monorepo related parameters
 - name: sparseCheckoutDir
  type: string
  default: ''
 - name: triggerDownstreamJobs
  type: boolean
  default: false
 - name: downstreamAggregateNames
  type: string
  default: ''
 - name: buildDependsOn
  type: object
  default: null
 - name: unifiedBuild
  type: boolean
  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -14,9 +33,8 @@ parameters:
  type: object
  default:
    - cmake
    - ninja-build
    - libgtest-dev
    - git
    - ninja-build
    - python3-pip
 - name: rocmDependencies
  type: object
@@ -33,103 +51,143 @@ parameters:
    - llvm-project
    - rocminfo
    - rocPRIM
    - ROCR-Runtime
    - rocprofiler-register
    - ROCR-Runtime
 - name: jobMatrix
  type: object
  default:
    buildJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
      - { os: almalinux8, packageManager: dnf, target: gfx942 }
      - { os: almalinux8, packageManager: dnf, target: gfx90a }
      - { os: almalinux8, packageManager: dnf, target: gfx1201 }
      - { os: almalinux8, packageManager: dnf, target: gfx1100 }
      - { os: almalinux8, packageManager: dnf, target: gfx1030 }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - gfx90a:
        target: gfx90a
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: hipCUB_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
          - ${{ build }}_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
        dependencyList:
          - gtest
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        consolidateBuildAndInstall: true
        extraBuildFlags: >-
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
          -DBUILD_BENCHMARK=ON
          -DBUILD_TEST=ON
          -DAMDGPU_TARGETS=${{ job.target }}
          -GNinja
        extraCxxFlags: -Wno-deprecated-declarations
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+    - ${{ if eq(job.os, 'ubuntu2204') }}:
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        aptPackages: ${{ parameters.aptPackages }}
+        parameters:
-        gpuTarget: ${{ job.target }}
+          aptPackages: ${{ parameters.aptPackages }}
          gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - job: hipCUB_test_${{ job.target }}
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    dependsOn: hipCUB_build_${{ job.target }}
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    condition:
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-      and(succeeded(),
+      condition:
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        and(succeeded(),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        eq(${{ parameters.aggregatePipeline }}, False)
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-      )
+          eq(${{ parameters.aggregatePipeline }}, False)
-    variables:
+        )
-    - group: common
+      variables:
-    - template: /.azuredevops/variables-global.yml
+      - group: common
-    pool: ${{ job.target }}_test_pool
+      - template: /.azuredevops/variables-global.yml
-    workspace:
+      pool: ${{ job.target }}_test_pool
-      clean: all
+      workspace:
-    steps:
+        clean: all
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      steps:
-      parameters:
+      - checkout: none
-        aptPackages: ${{ parameters.aptPackages }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+        parameters:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+          aptPackages: ${{ parameters.aptPackages }}
-      parameters:
+          packageManager: ${{ job.packageManager }}
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
-      parameters:
+          preTargetFilter: ${{ parameters.componentName }}
-        checkoutRef: ${{ parameters.checkoutRef }}
+          gpuTarget: ${{ job.target }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
+          os: ${{ job.os }}
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
-      parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
-        componentName: hipCUB
+          dependencyList: ${{ parameters.rocmTestDependencies }}
-        testDir: '$(Agent.BuildDirectory)/rocm/bin/hipcub'
+          gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+          os: ${{ job.os }}
-      parameters:
+          ${{ if parameters.triggerDownstreamJobs }}:
-        aptPackages: ${{ parameters.aptPackages }}
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-        environment: test
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
          testDir: '$(Agent.BuildDirectory)/rocm/bin/hipcub'
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          environment: test
          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/hipFFT.yml
+++ b/.azuredevops/components/hipFFT.yml
@@ -1,10 +1,29 @@
 # Template parameters for the hipFFT component pipeline.
 parameters:
 # Prefix of the build and test job names (<componentName>_build_<target>,
 # <componentName>_test_<target>); also forwarded to the manifest,
 # artifact-upload, local-artifact-download and test step templates.
 - name: componentName
  type: string
  default: hipFFT
 # Forwarded to steps/checkout.yml.
 - name: checkoutRepo
  type: string
  default: 'self'
 # Forwarded to steps/dependencies-rocm.yml.
 - name: checkoutRef
  type: string
  default: ''
 # monorepo-related parameters
 # sparseCheckoutDir is forwarded to steps/checkout.yml.
 - name: sparseCheckoutDir
  type: string
  default: ''
 # When true, downstreamAggregateNames is passed on to steps/dependencies-rocm.yml.
 - name: triggerDownstreamJobs
  type: boolean
  default: false
 # Only consumed when triggerDownstreamJobs is true (see above).
 - name: downstreamAggregateNames
  type: string
  default: ''
 # Optional list of upstream job-name prefixes. When set, each entry is suffixed
 # with _<target> and added to the build job's dependsOn; otherwise no
 # dependsOn key is emitted.
 - name: buildDependsOn
  type: object
  default: null
 # NOTE(review): no use of unifiedBuild is visible in the hunks shown for this
 # file - confirm it is consumed (hipCUB.yml/hipRAND.yml use it to gate test jobs).
 - name: unifiedBuild
  type: boolean
  default: false
 # set to true if doing a full build of the ROCm stack
 # and dependencies are pulled from the same pipeline
 - name: aggregatePipeline
@@ -61,7 +80,11 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: hipFFT_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.target }}
    # Emit dependsOn only when the caller supplied upstream build jobs.
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
          - ${{ build }}_${{ job.target }} # TODO: add OS, i.e. <build>_<os>_<target>, matching the job naming used in hipCUB.yml and hipRAND.yml
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -79,12 +102,15 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        extraBuildFlags: >-
@@ -102,9 +128,11 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -113,8 +141,8 @@ jobs:
    #     gpuTarget: ${{ job.target }}
 - ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: hipFFT_test_${{ job.target }}
+  - job: ${{ parameters.componentName }}_test_${{ job.target }}
-    dependsOn: hipFFT_build_${{ job.target }}
+    dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
    condition:
      and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -134,6 +162,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        preTargetFilter: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
@@ -141,10 +170,12 @@ jobs:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmTestDependencies }}
        gpuTarget: ${{ job.target }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
-        componentName: hipFFT
+        componentName: ${{ parameters.componentName }}
        testDir: '$(Agent.BuildDirectory)/rocm/bin'
        testExecutable: './hipfft-test'
        testParameters: '--test_prob 0.002 --gtest_output=xml:./test_output.xml --gtest_color=yes'
--- a/.azuredevops/components/hipRAND.yml
+++ b/.azuredevops/components/hipRAND.yml
@@ -1,10 +1,29 @@
 # Template parameters for the hipRAND component pipeline.
 parameters:
 # Prefix of every job name generated below (<componentName>_build_... and
 # <componentName>_test_...); also forwarded to the manifest, artifact-upload,
 # local-artifact-download and test step templates.
 - name: componentName
  type: string
  default: hipRAND
 # Forwarded to steps/checkout.yml.
 - name: checkoutRepo
  type: string
  default: 'self'
 # Forwarded to steps/dependencies-rocm.yml.
 - name: checkoutRef
  type: string
  default: ''
 # monorepo-related parameters
 # sparseCheckoutDir is forwarded to steps/checkout.yml and steps/manifest.yml.
 - name: sparseCheckoutDir
  type: string
  default: ''
 # When true, downstreamAggregateNames is passed on to steps/dependencies-rocm.yml.
 - name: triggerDownstreamJobs
  type: boolean
  default: false
 # Only consumed when triggerDownstreamJobs is true (see above).
 - name: downstreamAggregateNames
  type: string
  default: ''
 # Optional list of upstream job-name prefixes. When set, each entry is suffixed
 # with _<os>_<target> and added to the build job's dependsOn; otherwise no
 # dependsOn key is emitted.
 - name: buildDependsOn
  type: object
  default: null
 # The test jobs are only generated when this is false.
 - name: unifiedBuild
  type: boolean
  default: false
 # set to true if doing a full build of the ROCm stack
 # and dependencies are pulled from the same pipeline
 - name: aggregatePipeline
@@ -14,18 +33,18 @@ parameters:
  type: object
  default:
    - cmake
    - ninja-build
    - googletest
    - git
    - ninja-build
    - python3-pip
 - name: rocmDependencies
  type: object
  default:
    - llvm-project
    - ROCR-Runtime
    - clr
    - llvm-project
    - rocm-cmake
    - rocminfo
    - rocRAND
    - ROCR-Runtime
 - name: rocmTestDependencies
  type: object
  default:
@@ -33,110 +52,168 @@ parameters:
    - llvm-project
    - rocminfo
    - rocprofiler-register
    - ROCR-Runtime
    - rocRAND
    - ROCR-Runtime
 - name: jobMatrix
  type: object
  default:
    buildJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
      - { os: almalinux8, packageManager: dnf, target: gfx942 }
      - { os: almalinux8, packageManager: dnf, target: gfx90a }
      - { os: almalinux8, packageManager: dnf, target: gfx1201 }
      - { os: almalinux8, packageManager: dnf, target: gfx1100 }
      - { os: almalinux8, packageManager: dnf, target: gfx1030 }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+# - name: downstreamComponentMatrix
-        target: gfx90a
+#   type: object
 #   default:
 #     - rocFFT:
 #       name: rocFFT
 #       sparseCheckoutDir: projects/rocfft
 #       skipUnifiedBuild: 'false'
 #       buildDependsOn:
 #         - hipRAND_build
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: hipRAND_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
          - ${{ build }}_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    - name: HIP_ROCCLR_HOME
      value: $(Build.BinariesDirectory)/rocm
-    pool:
+    pool: ${{ variables.MEDIUM_BUILD_POOL }}
-      vmImage: ${{ variables.BASE_BUILD_POOL }}
+    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
        dependencyList:
          - gtest
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        consolidateBuildAndInstall: true
        extraBuildFlags: >-
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
          -DBUILD_TEST=ON
          -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
          -DCMAKE_BUILD_TYPE=Release
          -DAMDGPU_TARGETS=${{ job.target }}
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+    - ${{ if eq(job.os, 'ubuntu2204') }}:
-    #   parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-    #     aptPackages: ${{ parameters.aptPackages }}
+        parameters:
-    #     gpuTarget: ${{ job.target }}
+          aptPackages: ${{ parameters.aptPackages }}
-    #     extraEnvVars:
+          gpuTarget: ${{ job.target }}
-    #       - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
+          extraEnvVars:
            - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - job: hipRAND_test_${{ job.target }}
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    dependsOn: hipRAND_build_${{ job.target }}
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    condition:
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-        and(succeeded(),
+      condition:
-          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          and(succeeded(),
-          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+            eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-          eq(${{ parameters.aggregatePipeline }}, False)
+            not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-        )
+            eq(${{ parameters.aggregatePipeline }}, False)
-    variables:
+          )
-    - group: common
+      variables:
-    - template: /.azuredevops/variables-global.yml
+      - group: common
-    pool: ${{ job.target }}_test_pool
+      - template: /.azuredevops/variables-global.yml
-    workspace:
+      pool: ${{ job.target }}_test_pool
-      clean: all
+      workspace:
-    steps:
+        clean: all
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      steps:
-      parameters:
+      - checkout: none
-        aptPackages: ${{ parameters.aptPackages }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+        parameters:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+          aptPackages: ${{ parameters.aptPackages }}
-      parameters:
+          packageManager: ${{ job.packageManager }}
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
-      parameters:
+          preTargetFilter: ${{ parameters.componentName }}
-        checkoutRef: ${{ parameters.checkoutRef }}
+          os: ${{ job.os }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
-      parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
-        componentName: hipRAND
+          dependencyList: ${{ parameters.rocmTestDependencies }}
-        testDir: '$(Agent.BuildDirectory)/rocm/bin/hipRAND'
+          gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+          os: ${{ job.os }}
-      parameters:
+          ${{ if parameters.triggerDownstreamJobs }}:
-        aptPackages: ${{ parameters.aptPackages }}
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-        environment: test
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
          testDir: '$(Agent.BuildDirectory)/rocm/bin/hipRAND'
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          environment: test
          gpuTarget: ${{ job.target }}
 # - ${{ if parameters.triggerDownstreamJobs }}:
 #   - ${{ each component in parameters.downstreamComponentMatrix }}:
 #     - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
 #       - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
 #         parameters:
 #           checkoutRepo: ${{ parameters.checkoutRepo }}
 #           sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
 #           buildDependsOn: ${{ component.buildDependsOn }}
 #           downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
 #           triggerDownstreamJobs: true
 #           unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/llvm-project.yml
+++ b/.azuredevops/components/llvm-project.yml
@@ -14,146 +14,188 @@ parameters:
  type: object
  default:
    - cmake
    - python3-pip
    - libnuma-dev
    - ninja-build
    - python-is-python3
    - zlib1g-dev
    - pkg-config
    - python-is-python3
    - python3-pip
    - zlib1g-dev
 - name: rocmDependencies
  type: object
  default:
    - rocm-cmake
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: ubuntu2404, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
 jobs:
- job: llvm_project
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: llvm_project_${{ job.os }}
-  - group: common
+    pool:
-  - template: /.azuredevops/variables-global.yml
+      ${{ if eq(job.os, 'ubuntu2404') }}:
-  - name: HIP_DEVICE_LIB_PATH
+        name: 'rocm-ci_high_build_pool_2404' #temporarily using 'high' pool while 'ultra' is down
-    value: '$(Build.BinariesDirectory)/amdgcn/bitcode'
+      ${{ else }}:
-  - name: HIP_PATH
+        name: 'rocm-ci_ultra_build_pool'
-    value: '$(Agent.BuildDirectory)/rocm'
+    ${{ if eq(job.os, 'almalinux8') }}:
-  pool: ${{ variables.ULTRA_BUILD_POOL }}
+      container:
-  workspace:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    clean: all
+        endpoint: ContainerService3
-  steps:
+    variables:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+    - group: common
-    parameters:
+    - template: /.azuredevops/variables-global.yml
-      aptPackages: ${{ parameters.aptPackages }}
+    - name: HIP_DEVICE_LIB_PATH
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      value: '$(Build.BinariesDirectory)/amdgcn/bitcode'
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    - name: HIP_PATH
-    parameters:
+      value: '$(Agent.BuildDirectory)/rocm'
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+    workspace:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      clean: all
-    parameters:
+    steps:
-      checkoutRef: ${{ parameters.checkoutRef }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      dependencyList: ${{ parameters.rocmDependencies }}
+      parameters:
-      skipLlvmSymlink: true
+        aptPackages: ${{ parameters.aptPackages }}
-      aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        packageManager: ${{ job.packageManager }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      componentName: rocm-llvm
+      parameters:
-      extraBuildFlags: >-
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-        -DCMAKE_PREFIX_PATH="$(Build.BinariesDirectory)/llvm;$(Build.BinariesDirectory)"
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-        -DCMAKE_BUILD_TYPE=Release
+      parameters:
-        -DLLVM_ENABLE_PROJECTS=clang;lld;clang-tools-extra;mlir;flang
+        checkoutRef: ${{ parameters.checkoutRef }}
-        -DLLVM_ENABLE_RUNTIMES=compiler-rt;libunwind;libcxx;libcxxabi
+        dependencyList: ${{ parameters.rocmDependencies }}
-        -DCLANG_ENABLE_AMDCLANG=ON
+        skipLlvmSymlink: true
-        -DLLVM_TARGETS_TO_BUILD=AMDGPU;X86
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        -DLIBCXX_ENABLE_SHARED=OFF
+        os: ${{ job.os }}
-        -DLIBCXX_ENABLE_STATIC=ON
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-        -DLIBCXX_INSTALL_LIBRARY=OFF
+      parameters:
-        -DLIBCXX_INSTALL_HEADERS=OFF
+        componentName: rocm-llvm
-        -DLIBCXXABI_ENABLE_SHARED=OFF
+        os: ${{ job.os }}
-        -DLIBCXXABI_ENABLE_STATIC=ON
+        useAmdclang: false
-        -DLIBCXXABI_INSTALL_STATIC_LIBRARY=OFF
+        extraBuildFlags: >-
-        -DLLVM_BUILD_DOCS=OFF
+          -DCMAKE_PREFIX_PATH="$(Build.BinariesDirectory)/llvm;$(Build.BinariesDirectory)"
-        -DLLVM_ENABLE_SPHINX=OFF
+          -DCMAKE_BUILD_TYPE=Release
-        -DLLVM_ENABLE_ASSERTIONS=OFF
+          -DLLVM_ENABLE_PROJECTS=clang;lld;clang-tools-extra;mlir;flang
-        -DLLVM_ENABLE_Z3_SOLVER=OFF
+          -DLLVM_ENABLE_RUNTIMES=compiler-rt;libunwind;libcxx;libcxxabi
-        -DLLVM_ENABLE_ZLIB=ON
+          -DCLANG_ENABLE_AMDCLANG=ON
-        -DCLANG_DEFAULT_LINKER=lld
+          -DLLVM_TARGETS_TO_BUILD=AMDGPU;X86
-        -DCLANG_DEFAULT_RTLIB=compiler-rt
+          -DLIBCXX_ENABLE_SHARED=OFF
-        -DCLANG_DEFAULT_UNWINDLIB=libgcc
+          -DLIBCXX_ENABLE_STATIC=ON
-        -DSANITIZER_AMDGPU=OFF
+          -DLIBCXX_INSTALL_LIBRARY=OFF
-        -DPACKAGE_VENDOR=AMD
+          -DLIBCXX_INSTALL_HEADERS=OFF
-        -DCLANG_LINK_FLANG_LEGACY=ON
+          -DLIBCXXABI_ENABLE_SHARED=OFF
-        -DCMAKE_CXX_STANDARD=17
+          -DLIBCXXABI_ENABLE_STATIC=ON
-        -DROCM_LLVM_BACKWARD_COMPAT_LINK=$(Build.BinariesDirectory)/llvm
+          -DLIBCXXABI_INSTALL_STATIC_LIBRARY=OFF
-        -DROCM_LLVM_BACKWARD_COMPAT_LINK_TARGET=./lib/llvm
+          -DLLVM_BUILD_DOCS=OFF
-        -GNinja
+          -DLLVM_ENABLE_SPHINX=OFF
-      cmakeBuildDir: '$(Build.SourcesDirectory)/llvm/build'
+          -DLLVM_ENABLE_ASSERTIONS=OFF
-      cmakeSourceDir: '$(Build.SourcesDirectory)/llvm'
+          -DLLVM_ENABLE_Z3_SOLVER=OFF
-      installDir: '$(Build.BinariesDirectory)/llvm'
+          -DLLVM_ENABLE_ZLIB=ON
-# use llvm-lit to run unit tests for llvm, clang, and lld
+          -DCLANG_DEFAULT_LINKER=lld
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+          -DCLANG_DEFAULT_RTLIB=compiler-rt
-    parameters:
+          -DCLANG_DEFAULT_UNWINDLIB=libgcc
-      componentName: check-llvm
+          -DSANITIZER_AMDGPU=OFF
-      testDir: 'llvm/build'
+          -DPACKAGE_VENDOR=AMD
-      testExecutable: './bin/llvm-lit'
+          -DCLANG_LINK_FLANG_LEGACY=ON
-      testParameters: '-q --xunit-xml-output=llvm_test_output.xml --filter-out="live-debug-values-spill-tracking" ./test'
+          -DCMAKE_CXX_STANDARD=17
-      testOutputFile: llvm_test_output.xml
+          -DROCM_LLVM_BACKWARD_COMPAT_LINK=$(Build.BinariesDirectory)/llvm
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+          -DROCM_LLVM_BACKWARD_COMPAT_LINK_TARGET=./lib/llvm
-    parameters:
+          -GNinja
-      componentName: check-clang
+        cmakeBuildDir: '$(Build.SourcesDirectory)/llvm/build'
-      testDir: 'llvm/build'
+        cmakeSourceDir: '$(Build.SourcesDirectory)/llvm'
-      testExecutable: './bin/llvm-lit'
+        installDir: '$(Build.BinariesDirectory)/llvm'
-      testParameters: '-q --xunit-xml-output=clang_test_output.xml ./tools/clang/test'
+    # use llvm-lit to run unit tests for llvm, clang, and lld
-      testOutputFile: clang_test_output.xml
+    - task: Bash@3
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+      displayName: 'Copy llvm-lit to install directory'
-    parameters:
+      inputs:
-      componentName: check-lld
+        targetType: inline
-      testDir: 'llvm/build'
+        script: |
-      testExecutable: './bin/llvm-lit'
+          cp $(Build.SourcesDirectory)/llvm/build/bin/llvm-lit $(Build.BinariesDirectory)/llvm/bin/
-      testParameters: '-q --xunit-xml-output=lld_test_output.xml ./tools/lld/test'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      testOutputFile: lld_test_output.xml
+      parameters:
-  - task: CopyFiles@2
+        componentName: check-llvm
-    displayName: Copy FileCheck for Publishing
+        testDir: 'llvm/build'
-    inputs:
+        testExecutable: './bin/llvm-lit'
-      CleanTargetFolder: false
+        testParameters: '-q --xunit-xml-output=llvm_test_output.xml --filter-out="live-debug-values-spill-tracking" ./test'
-      SourceFolder: llvm/build/bin
+        testOutputFile: llvm_test_output.xml
-      Contents: FileCheck
+        os: ${{ job.os }}
-      TargetFolder: $(Build.BinariesDirectory)/llvm/bin
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      retryCount: 3
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        componentName: check-clang
-    parameters:
+        testDir: 'llvm/build'
-      componentName: device-libs
+        testExecutable: './bin/llvm-lit'
-      extraBuildFlags: >-
+        testParameters: '-q --xunit-xml-output=clang_test_output.xml ./tools/clang/test'
-        -DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build"
+        testOutputFile: clang_test_output.xml
-        -DCMAKE_BUILD_TYPE=Release
+        os: ${{ job.os }}
-      cmakeBuildDir: '$(Build.SourcesDirectory)/amd/device-libs/build'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      cmakeSourceDir: '$(Build.SourcesDirectory)/amd/device-libs'
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        componentName: check-lld
-    parameters:
+        testDir: 'llvm/build'
-      componentName: comgr
+        testExecutable: './bin/llvm-lit'
-      extraBuildFlags: >-
+        testParameters: '-q --xunit-xml-output=lld_test_output.xml ./tools/lld/test'
-        -DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build;$(Build.SourcesDirectory)/amd/device-libs/build"
+        testOutputFile: lld_test_output.xml
-        -DCOMGR_DISABLE_SPIRV=1
+        os: ${{ job.os }}
-        -DCMAKE_BUILD_TYPE=Release
+    - task: CopyFiles@2
-      cmakeBuildDir: '$(Build.SourcesDirectory)/amd/comgr/build'
+      displayName: Copy FileCheck for Publishing
-      cmakeSourceDir: '$(Build.SourcesDirectory)/amd/comgr'
+      inputs:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        CleanTargetFolder: false
-    parameters:
+        SourceFolder: llvm/build/bin
-      componentName: comgr
+        Contents: FileCheck
-      testParameters: '--output-on-failure --force-new-ctest-process --output-junit comgr_test_output.xml'
+        TargetFolder: $(Build.BinariesDirectory)/llvm/bin
-      testDir: 'amd/comgr/build'
+        retryCount: 3
-      testOutputFile: comgr_test_output.xml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
-    parameters:
+        componentName: device-libs
-      componentName: hipcc
+        os: ${{ job.os }}
-      extraBuildFlags: >-
+        useAmdclang: false
-        -DCMAKE_BUILD_TYPE=Release
+        extraBuildFlags: >-
-        -DHIPCC_BACKWARD_COMPATIBILITY=OFF
+          -DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build"
-      cmakeBuildDir: '$(Build.SourcesDirectory)/amd/hipcc/build'
+          -DCMAKE_BUILD_TYPE=Release
-      cmakeSourceDir: '$(Build.SourcesDirectory)/amd/hipcc'
+        cmakeBuildDir: '$(Build.SourcesDirectory)/amd/device-libs/build'
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+        cmakeSourceDir: '$(Build.SourcesDirectory)/amd/device-libs'
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        componentName: comgr
-    parameters:
+        os: ${{ job.os }}
-      aptPackages: ${{ parameters.aptPackages }}
+        useAmdclang: false
-      environment: combined
+        extraBuildFlags: >-
-      extraEnvVars:
+          -DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build;$(Build.SourcesDirectory)/amd/device-libs/build"
-        - HIP_DEVICE_LIB_PATH:::/home/user/workspace/bin/amdgcn/bitcode
+          -DCOMGR_DISABLE_SPIRV=1
-        - HIP_PATH:::/home/user/workspace/rocm
+          -DCMAKE_BUILD_TYPE=Release
        cmakeBuildDir: '$(Build.SourcesDirectory)/amd/comgr/build'
        cmakeSourceDir: '$(Build.SourcesDirectory)/amd/comgr'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
        componentName: comgr
        testParameters: '--output-on-failure --force-new-ctest-process --output-junit comgr_test_output.xml'
        testDir: 'amd/comgr/build'
        testOutputFile: comgr_test_output.xml
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        componentName: hipcc
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
          -DCMAKE_BUILD_TYPE=Release
          -DHIPCC_BACKWARD_COMPATIBILITY=OFF
        cmakeBuildDir: '$(Build.SourcesDirectory)/amd/hipcc/build'
        cmakeSourceDir: '$(Build.SourcesDirectory)/amd/hipcc'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - ${{ if eq(job.os, 'ubuntu2204') }}:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          environment: combined
          extraEnvVars:
            - HIP_DEVICE_LIB_PATH:::/home/user/workspace/bin/amdgcn/bitcode
            - HIP_PATH:::/home/user/workspace/rocm
--- a/.azuredevops/components/rccl.yml
+++ b/.azuredevops/components/rccl.yml
@@ -15,7 +15,6 @@ parameters:
  default:
    - cmake
    - git
    - googletest
    - libboost-program-options-dev
    - libdrm-dev
    - libfftw3-dev
@@ -90,6 +89,10 @@ jobs:
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        submoduleBehaviour: recursive
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
        dependencyList:
          - gtest
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
@@ -101,12 +104,11 @@ jobs:
        extraBuildFlags: >-
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
          -DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
          -DCMAKE_BUILD_TYPE=Release
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DBUILD_TESTS=ON
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake;$(Agent.BuildDirectory)/rocm/libexec/hipify
-          -DAMDGPU_TARGETS=${{ job.target }}
+          -DGPU_TARGETS=${{ job.target }}
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
--- a/.azuredevops/components/rocBLAS.yml
+++ b/.azuredevops/components/rocBLAS.yml
@@ -1,10 +1,29 @@
 parameters:
 - name: componentName
  type: string
  default: rocBLAS
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
 # monorepo related parameters
 - name: sparseCheckoutDir
  type: string
  default: ''
 - name: triggerDownstreamJobs
  type: boolean
  default: false
 - name: downstreamAggregateNames
  type: string
  default: ''
 - name: buildDependsOn
  type: object
  default: null
 - name: unifiedBuild
  type: boolean
  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -64,19 +83,43 @@ parameters:
  type: object
  default:
    buildJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-        target: gfx90a
+      # - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
      # - { os: almalinux8, packageManager: dnf, target: gfx942 }
      # - { os: almalinux8, packageManager: dnf, target: gfx90a }
      # - { os: almalinux8, packageManager: dnf, target: gfx1201 }
      # - { os: almalinux8, packageManager: dnf, target: gfx1100 }
      # - { os: almalinux8, packageManager: dnf, target: gfx1030 }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+# - name: downstreamComponentMatrix
-        target: gfx90a
+#   type: object
 #   default:
 #     # rocSOLVER depends on both rocBLAS and rocPRIM
 #     # for a unified build, rocBLAS will be the one to call rocSOLVER
 #     - rocSOLVER:
 #       name: rocSOLVER
 #       sparseCheckoutDir: projects/rocsolver
 #       skipUnifiedBuild: 'false'
 #       buildDependsOn:
 #         - rocBLAS_build
 #       unifiedBuild:
 #         downstreamAggregateNames: rocBLAS+rocPRIM
 #         buildDependsOn:
 #           - rocBLAS_build
 #           - rocPRIM_build
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocBLAS_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
          - ${{ build }}_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -89,6 +132,10 @@ jobs:
    - name: ROCM_PATH
      value: $(Agent.BuildDirectory)/rocm
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
@@ -96,19 +143,26 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aocl.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        extraBuildFlags: >-
          -DCMAKE_TOOLCHAIN_FILE=toolchain-linux.cmake
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
@@ -128,63 +182,94 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+    - ${{ if eq(job.os, 'ubuntu2204') }}:
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        aptPackages: ${{ parameters.aptPackages }}
+        parameters:
-        pipModules: ${{ parameters.pipModules }}
+          aptPackages: ${{ parameters.aptPackages }}
-        installAOCL: true
+          pipModules: ${{ parameters.pipModules }}
-        gpuTarget: ${{ job.target }}
+          installAOCL: true
-        extraEnvVars:
+          gpuTarget: ${{ job.target }}
-          - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
+          extraEnvVars:
-          - TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang
+            - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
-          - TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
+            - TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang
-          - ROCM_PATH:::/home/user/workspace/rocm
+            - TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
            - ROCM_PATH:::/home/user/workspace/rocm
- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - job: rocBLAS_test_${{ job.target }}
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    dependsOn: rocBLAS_build_${{ job.target }}
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    condition:
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-      and(succeeded(),
+      condition:
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        and(succeeded(),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        eq(${{ parameters.aggregatePipeline }}, False)
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-      )
+          eq(${{ parameters.aggregatePipeline }}, False)
-    variables:
+        )
-    - group: common
+      variables:
-    - template: /.azuredevops/variables-global.yml
+      - group: common
-    pool: ${{ job.target }}_test_pool
+      - template: /.azuredevops/variables-global.yml
-    workspace:
+      pool: ${{ job.target }}_test_pool
-      clean: all
+      workspace:
-    steps:
+        clean: all
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      steps:
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        aptPackages: ${{ parameters.aptPackages }}
+        parameters:
-        pipModules: ${{ parameters.pipModules }}
+          aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+          pipModules: ${{ parameters.pipModules }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        gpuTarget: ${{ job.target }}
+        parameters:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+          preTargetFilter: ${{ parameters.componentName }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+          os: ${{ job.os }}
-      parameters:
+          gpuTarget: ${{ job.target }}
-        checkoutRef: ${{ parameters.checkoutRef }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-        dependencyList: ${{ parameters.rocmTestDependencies }}
+        parameters:
-        gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
-      parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
-        componentName: rocBLAS
+          dependencyList: ${{ parameters.rocmTestDependencies }}
-        testDir: '$(Agent.BuildDirectory)/rocm/bin'
+          os: ${{ job.os }}
-        testExecutable: './rocblas-test'
+          gpuTarget: ${{ job.target }}
-        testParameters: '--yaml rocblas_smoke.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
+          ${{ if parameters.triggerDownstreamJobs }}:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-        aptPackages: ${{ parameters.aptPackages }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        pipModules: ${{ parameters.pipModules }}
+        parameters:
-        environment: test
+          componentName: ${{ parameters.componentName }}
-        gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
          testDir: '$(Agent.BuildDirectory)/rocm/bin'
          testExecutable: './rocblas-test'
          testParameters: '--yaml rocblas_smoke.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          pipModules: ${{ parameters.pipModules }}
          environment: test
          gpuTarget: ${{ job.target }}
 # - ${{ if parameters.triggerDownstreamJobs }}:
 #   - ${{ each component in parameters.downstreamComponentMatrix }}:
 #     - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
 #       - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
 #         parameters:
 #           checkoutRepo: ${{ parameters.checkoutRepo }}
 #           sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
 #           triggerDownstreamJobs: true
 #           unifiedBuild: ${{ parameters.unifiedBuild }}
 #           ${{ if parameters.unifiedBuild }}:
 #             buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
 #             downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
 #           ${{ else }}:
 #             buildDependsOn: ${{ component.buildDependsOn }}
 #             downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
--- a/.azuredevops/components/rocDecode.yml
+++ b/.azuredevops/components/rocDecode.yml
@@ -1,4 +1,7 @@
 parameters:
 - name: componentName
  type: string
  default: rocDecode
 - name: checkoutRepo
  type: string
  default: 'self'
@@ -13,29 +16,28 @@ parameters:
 - name: aptPackages
  type: object
  default:
    - python3-pip
    - cmake
    - ninja-build
    - pkg-config
    - ffmpeg
    - libavcodec-dev
    - libavformat-dev
    - libavutil-dev
    - libdrm-dev
    - libstdc++-12-dev
    - libva-amdgpu-dev
    - mesa-amdgpu-va-drivers
-    - libdrm-dev
+    - ninja-build
    - pkg-config
    - python3-pip
 - name: rocmDependencies
  type: object
  default:
    - rocm-cmake
    - llvm-project
    - ROCR-Runtime
    - clr
-    - rocminfo
+    - llvm-project
    - rocm-cmake
    - rocm-core
    - rocminfo
    - rocprofiler-register
    - ROCR-Runtime
 - name: rocmTestDependencies
  type: object
  default:
@@ -48,53 +50,70 @@ parameters:
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - gfx90a:
        target: gfx90a
 jobs:
- job: rocDecode_build
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}
-  - group: common
+    variables:
-  - template: /.azuredevops/variables-global.yml
+    - group: common
-  - name: ROCM_PATH
+    - template: /.azuredevops/variables-global.yml
-    value: $(Agent.BuildDirectory)/rocm
+    - name: ROCM_PATH
-  pool:
+      value: $(Agent.BuildDirectory)/rocm
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+    pool:
-  workspace:
+      vmImage: ${{ variables.BASE_BUILD_POOL }}
-    clean: all
+    ${{ if eq(job.os, 'almalinux8') }}:
-  steps:
+      container:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    parameters:
+        endpoint: ContainerService3
-      aptPackages: ${{ parameters.aptPackages }}
+    workspace:
-      registerROCmPackages: true
+      clean: all
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    steps:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-    parameters:
+      parameters:
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+        aptPackages: ${{ parameters.aptPackages }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        packageManager: ${{ job.packageManager }}
-    parameters:
+        registerROCmPackages: true
-      checkoutRef: ${{ parameters.checkoutRef }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      dependencyList: ${{ parameters.rocmDependencies }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      aggregatePipeline: ${{ parameters.aggregatePipeline }}
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      extraBuildFlags: >-
+      parameters:
-        -DCMAKE_BUILD_TYPE=Release
+        checkoutRef: ${{ parameters.checkoutRef }}
-        -GNinja
+        dependencyList: ${{ parameters.rocmDependencies }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+        os: ${{ job.os }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+      parameters:
-  #   parameters:
+        os: ${{ job.os }}
-  #     aptPackages: ${{ parameters.aptPackages }}
+        consolidateBuildAndInstall: true
-  #     registerROCmPackages: true
+        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_BUILD_TYPE=Release
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
    #     registerROCmPackages: true
 - ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocDecode_test_${{ job.target }}
+  - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    dependsOn: rocDecode_build
+    dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
    condition:
      and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -114,20 +133,27 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
        registerROCmPackages: true
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmTestDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - task: Bash@3
      displayName: Build rocDecode tests
      inputs:
        targetType: inline
        script: |
          ${{ iif(eq(job.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
          mkdir rocDecode-tests
          cd rocDecode-tests
          cmake $(Agent.BuildDirectory)/rocm/share/rocdecode/test
@@ -136,6 +162,7 @@ jobs:
      parameters:
        componentName: rocDecode
        testDir: 'rocDecode-tests'
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/rocFFT.yml
+++ b/.azuredevops/components/rocFFT.yml
@@ -1,10 +1,29 @@
 parameters:
 - name: componentName
  type: string
  default: rocFFT
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
 # monorepo related parameters
 - name: sparseCheckoutDir
  type: string
  default: ''
 - name: triggerDownstreamJobs
  type: boolean
  default: false
 - name: downstreamAggregateNames
  type: string
  default: ''
 - name: buildDependsOn
  type: object
  default: null
 - name: unifiedBuild
  type: boolean
  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -59,10 +78,23 @@ parameters:
        target: gfx942
      - gfx90a:
        target: gfx90a
 # - name: downstreamComponentMatrix
 #   type: object
 #   default:
 #     - hipFFT:
 #       name: hipFFT
 #       sparseCheckoutDir: projects/hipfft
 #       skipUnifiedBuild: 'false'
 #       buildDependsOn:
 #         - rocFFT_build
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocFFT_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.target }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
          - ${{ build }}_ubuntu2204_${{ job.target }} # todo: un-hardcode OS
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -79,12 +111,15 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        extraBuildFlags: >-
@@ -101,9 +136,11 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -114,8 +151,8 @@ jobs:
          - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
 - ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocFFT_test_${{ job.target }}
+  - job: ${{ parameters.componentName }}_test_${{ job.target }}
-    dependsOn: rocFFT_build_${{ job.target }}
+    dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
    condition:
      and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -135,6 +172,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        preTargetFilter: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
@@ -142,10 +180,12 @@ jobs:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmTestDependencies }}
        gpuTarget: ${{ job.target }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
-        componentName: rocFFT
+        componentName: ${{ parameters.componentName }}
        testDir: '$(Agent.BuildDirectory)/rocm/bin'
        testExecutable: './rocfft-test'
        testParameters: '--test_prob 0.004 --gtest_output=xml:./test_output.xml --gtest_color=yes'
@@ -154,3 +194,15 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        environment: test
        gpuTarget: ${{ job.target }}
 # - ${{ if parameters.triggerDownstreamJobs }}:
 #   - ${{ each component in parameters.downstreamComponentMatrix }}:
 #     - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
 #       - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
 #         parameters:
 #           checkoutRepo: ${{ parameters.checkoutRepo }}
 #           sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
 #           buildDependsOn: ${{ component.buildDependsOn }}
 #           downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
 #           triggerDownstreamJobs: true
 #           unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/rocJPEG.yml
+++ b/.azuredevops/components/rocJPEG.yml
@@ -1,4 +1,7 @@
 parameters:
 - name: componentName
  type: string
  default: rocJPEG
 - name: checkoutRepo
  type: string
  default: 'self'
@@ -44,32 +47,44 @@ parameters:
  type: object
  default:
    buildJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
      - { os: almalinux8, packageManager: dnf, target: gfx942 }
      - { os: almalinux8, packageManager: dnf, target: gfx90a }
      - { os: almalinux8, packageManager: dnf, target: gfx1201 }
      - { os: almalinux8, packageManager: dnf, target: gfx1100 }
      - { os: almalinux8, packageManager: dnf, target: gfx1030 }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - gfx90a:
        target: gfx90a
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocJPEG_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    - name: ROCM_PATH
      value: $(Agent.BuildDirectory)/rocm
    pool:
-      vmImage: ${{ variables.BASE_BUILD_POOL }}
+      ${{ if eq(job.os, 'ubuntu2404') }}:
        name: rocm-ci_medium_build_pool_2404
      ${{ else }}:
        name: ${{ variables.MEDIUM_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
        registerROCmPackages: true
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -80,17 +95,26 @@ jobs:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        consolidateBuildAndInstall: true
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
          -DCMAKE_BUILD_TYPE=Release
          -DGPU_TARGETS=${{ job.target }}
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
@@ -99,8 +123,8 @@ jobs:
    #     registerROCmPackages: true
 - ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocJPEG_test_${{ job.target }}
+  - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    dependsOn: rocJPEG_build_${{ job.target }}
+    dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    condition:
      and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -120,22 +144,28 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
        registerROCmPackages: true
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmTestDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - task: Bash@3
      displayName: Build rocJPEG tests
      inputs:
        targetType: inline
        script: |
          ${{ iif(eq(job.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
          mkdir rocJPEG-tests
          cd rocJPEG-tests
          cmake $(Agent.BuildDirectory)/rocm/share/rocjpeg/test
@@ -144,6 +174,7 @@ jobs:
      parameters:
        componentName: rocJPEG
        testDir: 'rocJPEG-tests'
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/rocMLIR.yml
+++ b/.azuredevops/components/rocMLIR.yml
@@ -27,6 +27,7 @@ parameters:
    - numpy
    - tomli
    - scipy
    - pybind11
 - name: rocmDependencies
  type: object
  default:
--- a/.azuredevops/components/rocPRIM.yml
+++ b/.azuredevops/components/rocPRIM.yml
@@ -1,16 +1,29 @@
 parameters:
 - name: componentName
  type: string
  default: rocPRIM
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
- name: sparseCheckout
+# monorepo related parameters
  type: boolean
  default: false
 - name: sparseCheckoutDir
  type: string
  default: ''
 - name: triggerDownstreamJobs
  type: boolean
  default: false
 - name: downstreamAggregateNames
  type: string
  default: ''
 - name: buildDependsOn
  type: object
  default: null
 - name: unifiedBuild
  type: boolean
  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -20,18 +33,17 @@ parameters:
  type: object
  default:
    - cmake
    - ninja-build
    - libgtest-dev
    - git
    - ninja-build
    - python3-pip
 - name: rocmDependencies
  type: object
  default:
    - rocm-cmake
    - llvm-project
    - ROCR-Runtime
    - clr
    - llvm-project
    - rocm-cmake
    - rocminfo
    - ROCR-Runtime
 - name: rocmTestDependencies
  type: object
  default:
@@ -45,98 +57,175 @@ parameters:
  type: object
  default:
    buildJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
      - { os: almalinux8, packageManager: dnf, target: gfx942 }
      - { os: almalinux8, packageManager: dnf, target: gfx90a }
      - { os: almalinux8, packageManager: dnf, target: gfx1201 }
      - { os: almalinux8, packageManager: dnf, target: gfx1100 }
      - { os: almalinux8, packageManager: dnf, target: gfx1030 }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 1, shardCount: 3 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 2, shardCount: 3 }
-      - gfx90a:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 3, shardCount: 3 }
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a, shard: 1, shardCount: 3 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a, shard: 2, shardCount: 3 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a, shard: 3, shardCount: 3 }
 - name: downstreamComponentMatrix
  type: object
  default:
    - rocThrust:
      name: rocThrust
      sparseCheckoutDir: projects/rocthrust
      skipUnifiedBuild: 'false'
      buildDependsOn:
        - rocPRIM_build
    - hipCUB:
      name: hipCUB
      sparseCheckoutDir: projects/hipcub
      skipUnifiedBuild: 'false'
      buildDependsOn:
        - rocPRIM_build
    # rocSOLVER depends on both rocBLAS and rocPRIM
    # for a unified build, rocBLAS will be the one to call rocSOLVER
    # - rocSOLVER:
    #   name: rocSOLVER
    #   sparseCheckoutDir: projects/rocsolver
    #   skipUnifiedBuild: 'true'
    #   buildDependsOn:
    #     - rocPRIM_build
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocPRIM_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
          - ${{ build }}_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckout: ${{ parameters.sparseCheckout }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
        dependencyList:
          - gtest
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        consolidateBuildAndInstall: true
        extraBuildFlags: >-
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
          -DBUILD_BENCHMARK=ON
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DAMDGPU_TARGETS=${{ job.target }}
          -DBUILD_BENCHMARK=ON
          -DBUILD_TEST=ON
          -GNinja
        extraCxxFlags: -Wno-deprecated-declarations
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+    - ${{ if eq(job.os, 'ubuntu2204') }}:
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        aptPackages: ${{ parameters.aptPackages }}
+        parameters:
-        gpuTarget: ${{ job.target }}
+          aptPackages: ${{ parameters.aptPackages }}
          gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - job: rocPRIM_test_${{ job.target }}
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    dependsOn: rocPRIM_build_${{ job.target }}
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}_shard_${{ job.shard }}
-    condition:
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-      and(succeeded(),
+      condition:
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        and(succeeded(),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        eq(${{ parameters.aggregatePipeline }}, False)
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-      )
+          eq(${{ parameters.aggregatePipeline }}, False)
-    variables:
+        )
-    - group: common
+      variables:
-    - template: /.azuredevops/variables-global.yml
+      - group: common
-    pool: ${{ job.target }}_test_pool
+      - template: /.azuredevops/variables-global.yml
-    workspace:
+      pool: ${{ job.target }}_test_pool
-      clean: all
+      workspace:
-    steps:
+        clean: all
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      steps:
-      parameters:
+      - checkout: none
-        aptPackages: ${{ parameters.aptPackages }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+        parameters:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+          aptPackages: ${{ parameters.aptPackages }}
-      parameters:
+          packageManager: ${{ job.packageManager }}
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
-      parameters:
+          preTargetFilter: ${{ parameters.componentName }}
-        checkoutRef: ${{ parameters.checkoutRef }}
+          gpuTarget: ${{ job.target }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
+          os: ${{ job.os }}
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
-      parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
-        componentName: rocPRIM
+          dependencyList: ${{ parameters.rocmTestDependencies }}
-        testDir: '$(Agent.BuildDirectory)/rocm/bin/rocprim'
+          gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+          os: ${{ job.os }}
-      parameters:
+          ${{ if parameters.triggerDownstreamJobs }}:
-        aptPackages: ${{ parameters.aptPackages }}
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-        environment: test
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
          testDir: '$(Agent.BuildDirectory)/rocm/bin/rocprim'
          extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }}'
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          environment: test
          gpuTarget: ${{ job.target }}
 - ${{ if parameters.triggerDownstreamJobs }}:
  - ${{ each component in parameters.downstreamComponentMatrix }}:
    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
        parameters:
          checkoutRepo: ${{ parameters.checkoutRepo }}
          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
          buildDependsOn: ${{ component.buildDependsOn }}
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
          triggerDownstreamJobs: true
          unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/rocRAND.yml
+++ b/.azuredevops/components/rocRAND.yml
@@ -1,10 +1,29 @@
 parameters:
 - name: componentName
  type: string
  default: rocRAND
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
 # monorepo related parameters
 - name: sparseCheckoutDir
  type: string
  default: ''
 - name: triggerDownstreamJobs
  type: boolean
  default: false
 - name: downstreamAggregateNames
  type: string
  default: ''
 - name: buildDependsOn
  type: object
  default: null
 - name: unifiedBuild
  type: boolean
  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -15,18 +34,16 @@ parameters:
  default:
    - cmake
    - git
    - googletest
    - libgtest-dev
    - ninja-build
    - python3-pip
 - name: rocmDependencies
  type: object
  default:
    - rocm-cmake
    - llvm-project
    - ROCR-Runtime
    - clr
    - llvm-project
    - rocm-cmake
    - rocminfo
    - ROCR-Runtime
 - name: rocmTestDependencies
  type: object
  default:
@@ -40,56 +57,96 @@ parameters:
  type: object
  default:
    buildJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
      - { os: almalinux8, packageManager: dnf, target: gfx942 }
      - { os: almalinux8, packageManager: dnf, target: gfx90a }
      - { os: almalinux8, packageManager: dnf, target: gfx1201 }
      - { os: almalinux8, packageManager: dnf, target: gfx1100 }
      - { os: almalinux8, packageManager: dnf, target: gfx1030 }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+- name: downstreamComponentMatrix
-        target: gfx90a
+  type: object
  default:
    - hipRAND:
      name: hipRAND
      sparseCheckoutDir: projects/hiprand
      skipUnifiedBuild: 'false'
      buildDependsOn:
        - rocRAND_build
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocRAND_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
          - ${{ build }}_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    - name: HIP_ROCCLR_HOME
      value: $(Build.BinariesDirectory)/rocm
    pool:
-      vmImage: ${{ variables.BASE_BUILD_POOL }}
+      ${{ if eq(job.os, 'ubuntu2404') }}:
        vmImage: 'ubuntu-24.04'
      ${{ else }}:
        name: ${{ variables.MEDIUM_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
        dependencyList:
          - gtest
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        consolidateBuildAndInstall: true
        extraBuildFlags: >-
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
          -DBUILD_TEST=ON
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DAMDGPU_TARGETS=${{ job.target }}
          -DBUILD_TEST=ON
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
@@ -98,42 +155,63 @@ jobs:
    #     extraEnvVars:
    #       - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - job: rocRAND_test_${{ job.target }}
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    dependsOn: rocRAND_build_${{ job.target }}
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    condition:
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-      and(succeeded(),
+      condition:
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        and(succeeded(),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        eq(${{ parameters.aggregatePipeline }}, False)
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-      )
+          eq(${{ parameters.aggregatePipeline }}, False)
-    variables:
+        )
-    - group: common
+      variables:
-    - template: /.azuredevops/variables-global.yml
+      - group: common
-    pool: ${{ job.target }}_test_pool
+      - template: /.azuredevops/variables-global.yml
-    workspace:
+      pool: ${{ job.target }}_test_pool
-      clean: all
+      workspace:
-    steps:
+        clean: all
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      steps:
-      parameters:
+      - checkout: none
-        aptPackages: ${{ parameters.aptPackages }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+        parameters:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+          aptPackages: ${{ parameters.aptPackages }}
-      parameters:
+          packageManager: ${{ job.packageManager }}
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
-      parameters:
+          preTargetFilter: ${{ parameters.componentName }}
-        checkoutRef: ${{ parameters.checkoutRef }}
+          gpuTarget: ${{ job.target }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
+          os: ${{ job.os }}
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
-      parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
-        componentName: rocRAND
+          dependencyList: ${{ parameters.rocmTestDependencies }}
-        testDir: '$(Agent.BuildDirectory)/rocm/bin/rocRAND'
+          gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+          os: ${{ job.os }}
-      parameters:
+          ${{ if parameters.triggerDownstreamJobs }}:
-        aptPackages: ${{ parameters.aptPackages }}
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-        environment: test
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
          testDir: '$(Agent.BuildDirectory)/rocm/bin/rocRAND'
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          environment: test
          gpuTarget: ${{ job.target }}
 - ${{ if parameters.triggerDownstreamJobs }}:
  - ${{ each component in parameters.downstreamComponentMatrix }}:
    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
        parameters:
          checkoutRepo: ${{ parameters.checkoutRepo }}
          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
          buildDependsOn: ${{ component.buildDependsOn }}
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
          triggerDownstreamJobs: true
          unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/rocSOLVER.yml
+++ b/.azuredevops/components/rocSOLVER.yml
@@ -1,10 +1,29 @@
 parameters:
 - name: componentName
  type: string
  default: rocSOLVER
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
 # monorepo related parameters
 - name: sparseCheckoutDir
  type: string
  default: ''
 - name: triggerDownstreamJobs
  type: boolean
  default: false
 - name: downstreamAggregateNames
  type: string
  default: ''
 - name: buildDependsOn
  type: object
  default: null
 - name: unifiedBuild
  type: boolean
  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -26,14 +45,12 @@ parameters:
  type: object
  default:
    - clr
    - hipSPARSE
    - llvm-project
    - rocBLAS
    - rocm-cmake
    - rocminfo
    - rocPRIM
    - ROCR-Runtime
    - rocSPARSE
 - name: rocmTestDependencies
  type: object
  default:
@@ -55,33 +72,47 @@ parameters:
  type: object
  default:
    buildJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-        target: gfx90a
+      # - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
      # - { os: almalinux8, packageManager: dnf, target: gfx942 }
      # - { os: almalinux8, packageManager: dnf, target: gfx90a }
      # - { os: almalinux8, packageManager: dnf, target: gfx1201 }
      # - { os: almalinux8, packageManager: dnf, target: gfx1100 }
      # - { os: almalinux8, packageManager: dnf, target: gfx1030 }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - gfx90a:
        target: gfx90a
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocSOLVER_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
          - ${{ build }}_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - task: Bash@3
      displayName: 'Clone lapack'
      inputs:
@@ -92,11 +123,15 @@ jobs:
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        componentName: lapack
        os: ${{ job.os }}
        extraBuildFlags: >-
          -DCMAKE_BUILD_TYPE=Release
          -DCMAKE_Fortran_FLAGS=-fno-optimize-sibling-calls
@@ -109,6 +144,7 @@ jobs:
        installDir: '$(Pipeline.Workspace)/deps-install'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
@@ -120,56 +156,71 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+    - ${{ if eq(job.os, 'ubuntu2204') }}:
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        aptPackages: ${{ parameters.aptPackages }}
+        parameters:
-        gpuTarget: ${{ job.target }}
+          aptPackages: ${{ parameters.aptPackages }}
-        extraCopyDirectories:
+          gpuTarget: ${{ job.target }}
-          - deps-install
+          extraCopyDirectories:
            - deps-install
- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - job: rocSOLVER_test_${{ job.target }}
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    dependsOn: rocSOLVER_build_${{ job.target }}
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    condition:
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-      and(succeeded(),
+      condition:
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        and(succeeded(),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        eq(${{ parameters.aggregatePipeline }}, False)
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-      )
+          eq(${{ parameters.aggregatePipeline }}, False)
-    variables:
+        )
-    - group: common
+      variables:
-    - template: /.azuredevops/variables-global.yml
+      - group: common
-    pool: ${{ job.target }}_test_pool
+      - template: /.azuredevops/variables-global.yml
-    workspace:
+      pool: ${{ job.target }}_test_pool
-      clean: all
+      workspace:
-    steps:
+        clean: all
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      steps:
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-        aptPackages: ${{ parameters.aptPackages }}
+        parameters:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+          aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-        gpuTarget: ${{ job.target }}
+        parameters:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+          preTargetFilter: ${{ parameters.componentName }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+          os: ${{ job.os }}
-      parameters:
+          gpuTarget: ${{ job.target }}
-        checkoutRef: ${{ parameters.checkoutRef }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-        dependencyList: ${{ parameters.rocmTestDependencies }}
+        parameters:
-        gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
-      parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
-        componentName: rocSOLVER
+          dependencyList: ${{ parameters.rocmTestDependencies }}
-        testDir: '$(Agent.BuildDirectory)/rocm/bin'
+          os: ${{ job.os }}
-        testExecutable: './rocsolver-test'
+          gpuTarget: ${{ job.target }}
-        testParameters: '--gtest_filter="*checkin*" --gtest_output=xml:./test_output.xml --gtest_color=yes'
+          ${{ if parameters.triggerDownstreamJobs }}:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-        aptPackages: ${{ parameters.aptPackages }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-        environment: test
+        parameters:
-        gpuTarget: ${{ job.target }}
+          componentName: ${{ parameters.componentName }}
          os: ${{ job.os }}
          testDir: '$(Agent.BuildDirectory)/rocm/bin'
          testExecutable: './rocsolver-test'
          testParameters: '--gtest_filter="*checkin*" --gtest_output=xml:./test_output.xml --gtest_color=yes'
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          environment: test
          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocThrust.yml
+++ b/.azuredevops/components/rocThrust.yml
@@ -1,10 +1,29 @@
 parameters:
 - name: componentName
  type: string
  default: rocThrust
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
 # monorepo related parameters
 - name: sparseCheckoutDir
  type: string
  default: ''
 - name: triggerDownstreamJobs
  type: boolean
  default: false
 - name: downstreamAggregateNames
  type: string
  default: ''
 - name: buildDependsOn
  type: object
  default: null
 - name: unifiedBuild
  type: boolean
  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -14,18 +33,17 @@ parameters:
  type: object
  default:
    - cmake
    - git
    - ninja-build
    - libboost-program-options-dev
    - googletest
    - libfftw3-dev
    - git
    - python3-pip
 - name: rocmDependencies
  type: object
  default:
    - clr
    - hipRAND
    - llvm-project
    - rocm-cmake
    - rocminfo
    - rocPRIM
    - ROCR-Runtime
@@ -36,104 +54,142 @@ parameters:
    - llvm-project
    - rocminfo
    - rocPRIM
    - ROCR-Runtime
    - hipRAND
    - rocprofiler-register
    - ROCR-Runtime
 - name: jobMatrix
  type: object
  default:
    buildJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
      - { os: almalinux8, packageManager: dnf, target: gfx942 }
      - { os: almalinux8, packageManager: dnf, target: gfx90a }
      - { os: almalinux8, packageManager: dnf, target: gfx1201 }
      - { os: almalinux8, packageManager: dnf, target: gfx1100 }
      - { os: almalinux8, packageManager: dnf, target: gfx1030 }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - gfx90a:
        target: gfx90a
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocThrust_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
          - ${{ build }}_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
        dependencyList:
          - gtest
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        consolidateBuildAndInstall: true
        extraBuildFlags: >-
          -GNinja
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
          -DAMDGPU_TARGETS=${{ job.target }}
          -DBUILD_TEST=ON
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+    - ${{ if eq(job.os, 'ubuntu2204') }}:
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        aptPackages: ${{ parameters.aptPackages }}
+        parameters:
-        gpuTarget: ${{ job.target }}
+          aptPackages: ${{ parameters.aptPackages }}
          gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:
-  - job: rocThrust_test_${{ job.target }}
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
-    dependsOn: rocThrust_build_${{ job.target }}
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    condition:
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-      and(succeeded(),
+      condition:
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+        and(succeeded(),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        eq(${{ parameters.aggregatePipeline }}, False)
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-      )
+          eq(${{ parameters.aggregatePipeline }}, False)
-    variables:
+        )
-    - group: common
+      variables:
-    - template: /.azuredevops/variables-global.yml
+      - group: common
-    pool: ${{ job.target }}_test_pool
+      - template: /.azuredevops/variables-global.yml
-    workspace:
+      pool: ${{ job.target }}_test_pool
-      clean: all
+      workspace:
-    steps:
+        clean: all
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      steps:
-      parameters:
+      - checkout: none
-        aptPackages: ${{ parameters.aptPackages }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+        parameters:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+          aptPackages: ${{ parameters.aptPackages }}
-      parameters:
+          packageManager: ${{ job.packageManager }}
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
-      parameters:
+          preTargetFilter: ${{ parameters.componentName }}
-        checkoutRef: ${{ parameters.checkoutRef }}
+          gpuTarget: ${{ job.target }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
+          os: ${{ job.os }}
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
-      parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
-        componentName: rocThrust
+          dependencyList: ${{ parameters.rocmTestDependencies }}
-        testDir: '$(Agent.BuildDirectory)/rocm/bin/rocthrust'
+          gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+          os: ${{ job.os }}
-      parameters:
+          ${{ if parameters.triggerDownstreamJobs }}:
-        aptPackages: ${{ parameters.aptPackages }}
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-        environment: test
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-        gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
          testDir: '$(Agent.BuildDirectory)/rocm/bin/rocthrust'
          testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml --exclude-regex "scan.hip"'
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          environment: test
          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocm-cmake.yml
+++ b/.azuredevops/components/rocm-cmake.yml
@@ -16,8 +16,6 @@ parameters:
    - doxygen
    - doxygen-doc
    - ninja-build
    - python3-pip
    - python3-sphinx
 - name: pipModules
  type: object
  default:
@@ -25,49 +23,75 @@ parameters:
    - cmake==3.20.5
    - ninja
    - rocm-docs-core
    - sphinx
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: ubuntu2404, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
 jobs:
- job: rocm_cmake
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: rocm_cmake_${{ job.os }}
-  - group: common
+    pool:
-  - template: /.azuredevops/variables-global.yml
+      ${{ if eq(job.os, 'ubuntu2404') }}:
-  pool:
+        vmImage: 'ubuntu-24.04'
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+      ${{ else }}:
-  workspace:
+        vmImage: 'ubuntu-22.04'
-    clean: all
+    ${{ if eq(job.os, 'almalinux8') }}:
-  steps:
+      container:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    parameters:
+        endpoint: ContainerService3
-      aptPackages: ${{ parameters.aptPackages }}
+    variables:
-      pipModules: ${{ parameters.pipModules }}
+    - group: common
-  - task: Bash@3
+    - template: /.azuredevops/variables-global.yml
-    displayName: Add CMake to PATH
+    workspace:
-    inputs:
+      clean: all
-      targetType: inline
+    steps:
-      script: echo "##vso[task.prependpath]$(python3 -m site --user-base)/bin"
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        aptPackages: ${{ parameters.aptPackages }}
-    parameters:
+        pipModules: ${{ parameters.pipModules }}
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+        packageManager: ${{ job.packageManager }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+    - task: Bash@3
-  - task: Bash@3
+      displayName: Add CMake to PATH
-    displayName: CTest setup
+      inputs:
-    inputs:
+        targetType: inline
-      targetType: inline
+        script: echo "##vso[task.prependpath]$(python3 -m site --user-base)/bin"
-      script: |
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-        python -m pip install -r $(Build.SourcesDirectory)/docs/requirements.txt
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-        python -m pip install -r $(Build.SourcesDirectory)/test/docsphinx/docs/.sphinx/requirements.txt
+      parameters:
-        git config --global user.email "you@example.com"
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-        git config --global user.name "Your Name"
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+      parameters:
-    parameters:
+        os: ${{ job.os }}
-      componentName: rocm-cmake
+        useAmdclang: false
-      testParameters: '-E "pass-version-parent" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
+    - task: Bash@3
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+      displayName: CTest setup
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      inputs:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+        targetType: inline
-  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        script: |
-  #   parameters:
+          python -m pip install -r $(Build.SourcesDirectory)/docs/requirements.txt
-  #     aptPackages: ${{ parameters.aptPackages }}
+          python -m pip install -r $(Build.SourcesDirectory)/test/docsphinx/docs/.sphinx/requirements.txt
-  #     pipModules: ${{ parameters.pipModules }}
+          git config --global user.email "you@example.com"
-  #     environment: combined
+          git config --global user.name "Your Name"
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
        componentName: rocm-cmake
        testParameters: '-E "pass-version-parent" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
    #     pipModules: ${{ parameters.pipModules }}
    #     environment: combined
--- a/.azuredevops/components/rocm-core.yml
+++ b/.azuredevops/components/rocm-core.yml
@@ -15,39 +15,61 @@ parameters:
  default:
    - cmake
    - ninja-build
-    - python3-pip
+
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: ubuntu2404, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
 jobs:
- job: rocm_core
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: rocm_core_${{ job.os }}
-  - group: common
+    pool:
-  - template: /.azuredevops/variables-global.yml
+      ${{ if eq(job.os, 'ubuntu2404') }}:
-  pool:
+        vmImage: 'ubuntu-24.04'
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+      ${{ else }}:
-  workspace:
+        vmImage: 'ubuntu-22.04'
-    clean: all
+    ${{ if eq(job.os, 'almalinux8') }}:
-  steps:
+      container:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    parameters:
+        endpoint: ContainerService3
-      aptPackages: ${{ parameters.aptPackages }}
+    variables:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - group: common
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    - template: /.azuredevops/variables-global.yml
-    parameters:
+    workspace:
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+      clean: all
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+    steps:
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      extraBuildFlags: >-
+      parameters:
-        -DCMAKE_CURRENT_BINARY_DIR=$PWD
+        aptPackages: ${{ parameters.aptPackages }}
-        -DCMAKE_CURRENT_SOURCE_DIR=$PWD/../
+        packageManager: ${{ job.packageManager }}
-        -DCMAKE_VERBOSE_MAKEFILE=1
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-        -DCPACK_GENERATOR=DEB
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-        -DCPACK_DEBIAN_PACKAGE_RELEASE="local.9999~99.99"
+      parameters:
-        -DCPACK_RPM_PACKAGE_RELEASE="local.9999"
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-        -DROCM_VERSION="$(NEXT_RELEASE_VERSION)"
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-        -GNinja
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+        os: ${{ job.os }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+        useAmdclang: false
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+        extraBuildFlags: >-
-  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+          -DCMAKE_CURRENT_BINARY_DIR=$PWD
-  #   parameters:
+          -DCMAKE_CURRENT_SOURCE_DIR=$PWD/../
-  #     aptPackages: ${{ parameters.aptPackages }}
+          -DCMAKE_VERBOSE_MAKEFILE=1
          -DCPACK_GENERATOR=DEB
          -DCPACK_DEBIAN_PACKAGE_RELEASE="local.9999~99.99"
          -DCPACK_RPM_PACKAGE_RELEASE="local.9999"
          -DROCM_VERSION="$(NEXT_RELEASE_VERSION)"
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/rocm-examples.yml
+++ b/.azuredevops/components/rocm-examples.yml
@@ -15,6 +15,7 @@ parameters:
  default:
    - cmake
    - libglfw3-dev
    - libmsgpack-dev
    - libtbb-dev
    - ninja-build
    - python3-pip
--- a/.azuredevops/components/rocm_smi_lib.yml
+++ b/.azuredevops/components/rocm_smi_lib.yml
@@ -17,50 +17,66 @@ parameters:
    - libdrm-dev
    - ninja-build
    - pkg-config
    - python3-pip
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: ubuntu2404, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - gfx90a:
        target: gfx90a
 jobs:
- job: rocm_smi_lib_build
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: rocm_smi_lib_build_${{ job.os }}
-  - group: common
+    pool:
-  - template: /.azuredevops/variables-global.yml
+      ${{ if eq(job.os, 'ubuntu2404') }}:
-  pool:
+        vmImage: 'ubuntu-24.04'
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+      ${{ else }}:
-  workspace:
+        vmImage: 'ubuntu-22.04'
-    clean: all
+    ${{ if eq(job.os, 'almalinux8') }}:
-  steps:
+      container:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    parameters:
+        endpoint: ContainerService3
-      aptPackages: ${{ parameters.aptPackages }}
+    variables:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - group: common
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    - template: /.azuredevops/variables-global.yml
-    parameters:
+    workspace:
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+      clean: all
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+    steps:
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      extraBuildFlags: >-
+      parameters:
-        -DBUILD_TESTS=ON
+        aptPackages: ${{ parameters.aptPackages }}
-        -DROCM_DEP_ROCMCORE=ON
+        packageManager: ${{ job.packageManager }}
-        -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-  #   parameters:
+      parameters:
-  #     aptPackages: ${{ parameters.aptPackages }}
+        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
          -DBUILD_TESTS=ON
          -DROCM_DEP_ROCMCORE=ON
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
 - ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocm_smi_lib_test_${{ job.target }}
+  - job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
-    dependsOn: rocm_smi_lib_build
+    dependsOn: rocm_smi_lib_build_${{ job.os }}
    condition:
      and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -77,8 +93,11 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
      parameters:
        runRocminfo: false
@@ -86,8 +105,9 @@ jobs:
      parameters:
        componentName: rocm_smi_lib
        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: './rocm/share/rocm_smi/rsmitst_tests/rsmitst'
+        testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/rocminfo.yml
+++ b/.azuredevops/components/rocminfo.yml
@@ -17,7 +17,6 @@ parameters:
    - libdrm-amdgpu-dev
    - libdrm-dev
    - ninja-build
    - python3-pip
 - name: rocmDependencies
  type: object
  default:
@@ -32,49 +31,63 @@ parameters:
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - gfx90a:
        target: gfx90a
 jobs:
- job: rocminfo
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: rocminfo_build_${{ job.os }}
-  - group: common
+    pool:
-  - template: /.azuredevops/variables-global.yml
+      vmImage: 'ubuntu-22.04'
-  pool:
+    ${{ if eq(job.os, 'almalinux8') }}:
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+      container:
-  workspace:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    clean: all
+        endpoint: ContainerService3
-  steps:
+    variables:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+    - group: common
-    parameters:
+    - template: /.azuredevops/variables-global.yml
-      aptPackages: ${{ parameters.aptPackages }}
+    workspace:
-      registerROCmPackages: true
+      clean: all
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    steps:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-    parameters:
+      parameters:
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+        aptPackages: ${{ parameters.aptPackages }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        packageManager: ${{ job.packageManager }}
-    parameters:
+        registerROCmPackages: true
-      checkoutRef: ${{ parameters.checkoutRef }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-      dependencyList: ${{ parameters.rocmDependencies }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      aggregatePipeline: ${{ parameters.aggregatePipeline }}
+      parameters:
-      skipLlvmSymlink: true
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-    parameters:
+      parameters:
-      extraBuildFlags: >-
+        checkoutRef: ${{ parameters.checkoutRef }}
-        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+        dependencyList: ${{ parameters.rocmDependencies }}
-        -DROCRTST_BLD_TYPE=release
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        -GNinja
+        skipLlvmSymlink: true
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+        os: ${{ job.os }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+      parameters:
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DROCRTST_BLD_TYPE=release
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
 - ${{ each job in parameters.jobMatrix.testJobs }}:
  - job: rocminfo_test_${{ job.target }}
-    dependsOn: rocminfo
+    dependsOn: rocminfo_build_${{ job.os }}
    condition:
      and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -91,14 +104,18 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
        registerROCmPackages: true
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmTestDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
      parameters:
        runRocminfo: false
@@ -109,6 +126,7 @@ jobs:
        testExecutable: './rocm/bin/rocminfo'
        testParameters: ''
        testPublishResults: false
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
        componentName: rocm_agent_enumerator
@@ -116,6 +134,7 @@ jobs:
        testExecutable: './rocm/bin/rocm_agent_enumerator'
        testParameters: ''
        testPublishResults: false
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -24,24 +24,28 @@ parameters:
  default:
    - astunparse==1.6.2
    - colorlover
-    - "dash>=1.12.0"
+    - dash-bootstrap-components
    - dash-svg
    - "dash>=3.0.0"
    - kaleido==0.2.1
    - matplotlib
    - "numpy>=1.17.5"
    - "pandas>=1.4.3"
    - plotext
    - plotille
    - pymongo
    - pyyaml
    - tabulate
    - tqdm
    - dash-svg
    - dash-bootstrap-components
    - kaleido
    - setuptools
-    - plotille
+    - tabulate
    - textual
    - textual_plotext
    - textual-fspicker
    - tqdm
    - mock
    - pytest
    - pytest-cov
    - pytest-xdist
- name: rocmDependencies
+- name: rocmTestDependencies
  type: object
  default:
    - amdsmi
@@ -114,14 +118,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        dependencySource: ${{ job.dependencySource }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        extraBuildFlags: >-
@@ -165,14 +161,6 @@ jobs:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - task: Bash@3
      displayName: Add en_US.UTF-8 locale
      inputs:
        targetType: inline
        script: |
          sudo locale-gen en_US.UTF-8
          sudo update-locale
          locale -a
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -184,9 +172,17 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependencies }}
+        dependencyList: ${{ parameters.rocmTestDependencies }}
        dependencySource: ${{ job.dependencySource }}
        gpuTarget: ${{ job.target }}
    - task: Bash@3
      displayName: Add en_US.UTF-8 locale
      inputs:
        targetType: inline
        script: |
          sudo locale-gen en_US.UTF-8
          sudo update-locale
          locale -a
    - task: Bash@3
      displayName: Add ROCm binaries to PATH
      inputs:
--- a/.azuredevops/components/rocprofiler-register.yml
+++ b/.azuredevops/components/rocprofiler-register.yml
@@ -15,40 +15,62 @@ parameters:
  default:
    - cmake
    - ninja-build
-    - python3-pip
+
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: ubuntu2404, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
 jobs:
- job: rocprofiler_register
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: rocprofiler_register_${{ job.os }}
-  - group: common
+    pool:
-  - template: /.azuredevops/variables-global.yml
+      ${{ if eq(job.os, 'ubuntu2404') }}:
-  pool:
+        vmImage: 'ubuntu-24.04'
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+      ${{ else }}:
-  workspace:
+        vmImage: 'ubuntu-22.04'
-    clean: all
+    ${{ if eq(job.os, 'almalinux8') }}:
-  steps:
+      container:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    parameters:
+        endpoint: ContainerService3
-      aptPackages: ${{ parameters.aptPackages }}
+    variables:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - group: common
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    - template: /.azuredevops/variables-global.yml
-    parameters:
+    workspace:
-      checkoutRepo: ${{ parameters.checkoutRepo }}
+      clean: all
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+    steps:
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      componentName: rocprofiler-register
+      parameters:
-      extraBuildFlags: >-
+        aptPackages: ${{ parameters.aptPackages }}
-        -DCMAKE_PREFIX_PATH=$(Build.BinariesDirectory)
+        packageManager: ${{ job.packageManager }}
-        -DROCPROFILER_REGISTER_BUILD_TESTS=ON
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-        -DROCPROFILER_REGISTER_BUILD_SAMPLES=ON
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-        -GNinja
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        checkoutRepo: ${{ parameters.checkoutRepo }}
-    parameters:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      componentName: rocprofiler-register
+      parameters:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+        componentName: rocprofiler-register
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+        os: ${{ job.os }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+        useAmdclang: false
-  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        extraBuildFlags: >-
-  #   parameters:
+          -DCMAKE_PREFIX_PATH=$(Build.BinariesDirectory)
-  #     aptPackages: ${{ parameters.aptPackages }}
+          -DROCPROFILER_REGISTER_BUILD_TESTS=ON
-  #     environment: combined
+          -DROCPROFILER_REGISTER_BUILD_SAMPLES=ON
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
        componentName: rocprofiler-register
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
    #     environment: combined
--- a/.azuredevops/components/rocprofiler-sdk.yml
+++ b/.azuredevops/components/rocprofiler-sdk.yml
@@ -14,10 +14,12 @@ parameters:
  type: object
  default:
    - build-essential
    - cmake
    - libdrm-amdgpu-dev
    - libdrm-dev
    - libdw-dev
    - libelf-dev
    - libsqlite3-dev
    - libva-dev
    - ninja-build
    - pkg-config
@@ -74,8 +76,7 @@ jobs:
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
-    pool:
+    pool: ${{ variables.MEDIUM_BUILD_POOL }}
      vmImage: ${{ variables.BASE_BUILD_POOL }}
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocprofiler.yml
+++ b/.azuredevops/components/rocprofiler.yml
@@ -1,4 +1,7 @@
 parameters:
 - name: componentName
  type: string
  default: rocprofiler
 - name: checkoutRepo
  type: string
  default: 'self'
@@ -15,7 +18,6 @@ parameters:
  type: object
  default:
    - cmake
    - libgtest-dev
    - libdrm-dev
    - libdw-dev
    - libsystemd-dev
@@ -26,13 +28,13 @@ parameters:
 - name: pipModules
  type: object
  default:
    - pyyaml==5.3.1
    - Cppheaderparser
    - websockets
    - matplotlib
    - lxml
    - barectf
    - Cppheaderparser
    - lxml
    - matplotlib
    - pandas
    - pyyaml==5.3.1
    - websockets
 - name: rocmDependencies
  type: object
  default:
@@ -41,29 +43,33 @@ parameters:
    - ROCdbgapi
    - rocm-cmake
    - rocm-core
    - rocm_smi_lib
    - rocminfo
-    - ROCR-Runtime
+    - rocm_smi_lib
    - rocprofiler-register
    - ROCR-Runtime
    - roctracer
 - name: jobMatrix
  type: object
  default:
    buildJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
      - { os: almalinux8, packageManager: dnf, target: gfx942 }
      - { os: almalinux8, packageManager: dnf, target: gfx90a }
      - { os: almalinux8, packageManager: dnf, target: gfx1201 }
      - { os: almalinux8, packageManager: dnf, target: gfx1100 }
      - { os: almalinux8, packageManager: dnf, target: gfx1030 }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - gfx90a:
        target: gfx90a
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocprofiler_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -72,6 +78,10 @@ jobs:
    - name: ROCM_PATH
      value: $(Agent.BuildDirectory)/rocm
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
@@ -79,46 +89,59 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
        dependencyList:
          - gtest
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
-          -DCMAKE_MODULE_PATH=$(Build.SourcesDirectory)/cmake_modules;$(Agent.BuildDirectory)/rocm/lib/cmake;$(Agent.BuildDirectory)/rocm/lib/cmake/hip
+          -DCMAKE_MODULE_PATH=$(Build.SourcesDirectory)/cmake_modules;$(Agent.BuildDirectory)/rocm/lib/cmake;$(Agent.BuildDirectory)/rocm/lib/cmake/hip;$(Agent.BuildDirectory)/rocm/lib64/cmake;$(Agent.BuildDirectory)/rocm/lib64/cmake/hip
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
          -DCMAKE_POSITION_INDEPENDENT_CODE=ON
          -DENABLE_LDCONFIG=OFF
          -DUSE_PROF_API=1
          -DGPU_TARGETS=${{ job.target }}
          -DAMDGPU_TARGETS=${{ job.target }}
        multithreadFlag: -- -j32
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+    - ${{ if eq(job.os, 'ubuntu2204') }}:
-      parameters:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-        aptPackages: ${{ parameters.aptPackages }}
+        parameters:
-        pipModules: ${{ parameters.pipModules }}
+          aptPackages: ${{ parameters.aptPackages }}
-        gpuTarget: ${{ job.target }}
+          pipModules: ${{ parameters.pipModules }}
-        extraEnvVars:
+          gpuTarget: ${{ job.target }}
-          - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
+          extraEnvVars:
-          - ROCM_PATH:::/home/user/workspace/rocm
+            - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
            - ROCM_PATH:::/home/user/workspace/rocm
 - ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocprofiler_test_${{ job.target }}
+  - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    dependsOn: rocprofiler_build_${{ job.target }}
+    dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    condition:
      and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -139,16 +162,21 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
@@ -157,12 +185,14 @@ jobs:
        testExecutable:  ./run.sh
        testParameters: ''
        testPublishResults: false
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
        componentName: rocprofilerV2
        testDir: $(Agent.BuildDirectory)/rocm
        testExecutable:  share/rocprofiler/tests/runUnitTests
        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/roctracer.yml
+++ b/.azuredevops/components/roctracer.yml
@@ -1,4 +1,7 @@
 parameters:
 - name: componentName
  type: string
  default: roctracer
 - name: checkoutRepo
  type: string
  default: 'self'
@@ -18,7 +21,7 @@ parameters:
    - graphviz
    - libdrm-amdgpu-dev
    - ninja-build
-    - python3-pip
+    - zlib1g-dev
 - name: pipModules
  type: object
  default:
@@ -45,26 +48,32 @@ parameters:
  type: object
  default:
    buildJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
-      - gfx90a:
+      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
      - { os: almalinux8, packageManager: dnf, target: gfx942 }
      - { os: almalinux8, packageManager: dnf, target: gfx90a }
      - { os: almalinux8, packageManager: dnf, target: gfx1201 }
      - { os: almalinux8, packageManager: dnf, target: gfx1100 }
      - { os: almalinux8, packageManager: dnf, target: gfx1030 }
    testJobs:
-      - gfx942:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
-        target: gfx942
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - gfx90a:
        target: gfx90a
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: roctracer_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    - name: HIP_ROCCLR_HOME
      value: $(Build.BinariesDirectory)/rocm
    pool:
      vmImage: ${{ variables.BASE_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
@@ -72,6 +81,7 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
        registerROCmPackages: true
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -83,21 +93,27 @@ jobs:
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
    # the linker flags will not affect ubuntu2204 builds as the paths do not exist
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
          -DCMAKE_BUILD_TYPE=release
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
-          -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
+          -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip;$(Agent.BuildDirectory)/rocm/lib64/cmake/hip
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DGPU_TARGETS=${{ job.target }}
-          -DAMDGPU_TARGETS=${{ job.target }}
+          -DCMAKE_POSITION_INDEPENDENT_CODE=ON
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -108,8 +124,8 @@ jobs:
    #     registerROCmPackages: true
 - ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: roctracer_test_${{ job.target }}
+  - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    dependsOn: roctracer_build_${{ job.target }}
+    dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    condition:
      and(succeeded(),
        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -127,17 +143,20 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
        registerROCmPackages: true
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmTestDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
@@ -146,6 +165,7 @@ jobs:
        testParameters: ''
        testDir: $(Agent.BuildDirectory)
        testPublishResults: false
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/dependencies/gtest.yml
+++ b/.azuredevops/dependencies/gtest.yml
@@ -11,36 +11,54 @@ parameters:
 - name: aptPackages
  type: object
  default:
    - git
    - cmake
    - git
    - ninja-build
 - name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
 jobs:
- job: gtest
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  variables:
+  - job: gtest_${{ job.os }}
-  - group: common
+    variables:
-  - template: /.azuredevops/variables-global.yml
+    - group: common
-  pool:
+    - template: /.azuredevops/variables-global.yml
-    vmImage: ${{ variables.BASE_BUILD_POOL }}
+    pool:
-  workspace:
+      vmImage: 'ubuntu-22.04'
-    clean: all
+    ${{ if eq(job.os, 'almalinux8') }}:
-  steps:
+      container:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-    parameters:
+        endpoint: ContainerService3
-      aptPackages: ${{ parameters.aptPackages }}
+    workspace:
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      clean: all
-  - task: Bash@3
+    steps:
-    displayName: 'git clone gtest'
+    - checkout: none
-    inputs:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      targetType: inline
+      parameters:
-      script: git clone -b ${{ parameters.gtestVersion }} https://github.com/google/googletest --depth=1 --shallow-submodules --recurse-submodules
+        aptPackages: ${{ parameters.aptPackages }}
-      workingDirectory: $(Agent.BuildDirectory)
+        packageManager: ${{ job.packageManager }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    parameters:
+    - task: Bash@3
-      cmakeBuildDir: $(Agent.BuildDirectory)/googletest/build
+      displayName: Clone GTest ${{ parameters.gtestVersion }}
-      cmakeSourceDir: $(Agent.BuildDirectory)/googletest
+      inputs:
-      extraBuildFlags: >-
+        targetType: inline
-        -DGTEST_FORCE_SHARED_CRT=ON
+        script: git clone https://github.com/google/googletest -b ${{ parameters.gtestVersion }} --depth=1 --shallow-submodules --recurse-submodules
-        -DCMAKE_DEBUG_POSTFIX=d
+        workingDirectory: $(Agent.BuildDirectory)
-        -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
        os: ${{ job.os }}
        cmakeBuildDir: $(Agent.BuildDirectory)/googletest/build
        cmakeSourceDir: $(Agent.BuildDirectory)/googletest
        useAmdclang: false
        extraBuildFlags: >-
          -DGTEST_FORCE_SHARED_CRT=ON
          -DCMAKE_DEBUG_POSTFIX=d
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
--- a/.azuredevops/nightly/pytorch.yml
+++ b/.azuredevops/nightly/pytorch.yml
@@ -4,71 +4,71 @@ parameters:
 - name: aptPackages
  type: object
  default:
-    - build-essential
+    - autoconf
    - git
    - ninja-build
    - openjdk-8-jdk
    - ca-certificates
    - bc
    - bridge-utils
    - build-essential
    - ca-certificates
    - ccache
    - devscripts
    - dkms
    - doxygen
    - fakeroot
    - ffmpeg
    - gfortran
    - git
    - gnutls-bin
    - libamd2
    - libavformat-dev
    - libblas3
    - libcamd2
    - libccolamd2
    - libcholmod3
    - libcolamd2
    - libdpkg-dev
    - libdpkg-perl
    - libdrm-amdgpu1
    - libdrm-dev
    - libelf-dev
    - libfreetype-dev
    - libgfortran5
    - libgomp1
    - libjpeg-dev
    - libjpeg-turbo-official
    - liblapack-dev
    - liblapack3
    - libmetis5
    - libncurses-dev
    - libnuma-dev
    - libopenblas-dev
    - libpth-dev
    - libquadmath0
    - libssh-dev
    - libstdc++-12-dev
    - libsuitesparseconfig5
    - libswscale-dev
    - libtinfo-dev
    - libunwind-dev
    - libwebp-dev
    - llvm-dev
    - ncurses-base
    - ninja-build
    - numactl
    - openjdk-8-jdk
    - python-is-python3
    - python3-dev
    - python3-pip
    - python3-venv
    - wget
    - ncurses-base
    - libncurses-dev
    - numactl
    - libnuma-dev
    - libssh-dev
    - libunwind-dev
    - llvm-dev
    - libpth-dev
    - qemu-kvm
    - re2c
    - subversion
-    - fakeroot
+    - wget
    - autoconf
    - libgomp1
    - libtinfo-dev
    - libcholmod3
    - libsuitesparseconfig5
    - libstdc++-12-dev
    - python-is-python3
    - gfortran
    - libgfortran5
    - liblapack3
    - libblas3
    - libquadmath0
    - libmetis5
    - libamd2
    - libcamd2
    - libcolamd2
    - libccolamd2
    - libdrm-amdgpu1
    - ccache
    - zip
    - libjpeg-turbo-official
    - libjpeg-dev
    - libwebp-dev
    - libfreetype-dev
    - gnutls-bin
    - ffmpeg
    - libopenblas-dev
    - liblapack-dev
    - libswscale-dev
    - libavformat-dev
 - name: pipModules
  type: object
  default:
    - cmake
    - astunparse
-    - "expecttest>=0.2.1"
+    - "expecttest>=0.3.0"
    - hypothesis
    - numpy
    - psutil
@@ -76,8 +76,8 @@ parameters:
    - requests
    - setuptools==75.8.0
    - types-dataclasses
-    - "typing-extensions>=4.8.0"
+    - "typing-extensions>=4.10.0"
-    - "sympy>=1.13.0"
+    - "sympy>=1.13.3"
    - filelock
    - networkx
    - jinja2
@@ -97,36 +97,39 @@ parameters:
 - name: rocmDependencies
  type: object
  default:
    - rocminfo
    - MIOpen
    - clr
    - hipBLAS
    - hipBLASLt
    - hipFFT
    - hipRAND
    - hipSOLVER
    - hipSPARSE
-    - ROCR-Runtime
+    - hipSPARSELt
    - llvm-project
    - MIOpen
    - rccl
    - rocBLAS
    - rocFFT
    - rocm-core
    - rocminfo
    - rocm_smi_lib
    - rocPRIM
    - rocprofiler-register
    - rocRAND
    - ROCR-Runtime
    - rocSOLVER
    - rocSPARSE
    - roctracer
    - hipBLASLt
    - rocprofiler-register
    - rocm-core
    - rocPRIM
    # below are additional dependencies not called out by build script, but throw errors during cmake
    - composable_kernel
    - hipBLAS-common
    - hipCUB
    - rocThrust
    - hipBLAS-common
    - composable_kernel
 - name: rocmTestDependencies
  type: object
  default:
    # rocroller.so needed and is not included in the wheel
    - hipBLASLt
    - rocminfo
 # Reference on what tests to run for torchvision found in private repo:
 # https://github.com/ROCm/rocAutomation/blob/jenkins-pipelines/pytorch/pytorch_ci/test_pytorch_test1.sh#L54
@@ -240,12 +243,6 @@ jobs:
        git clone https://github.com/pytorch/builder.git --depth=1 --recurse-submodules
        sudo ln -s $(Build.SourcesDirectory)/builder /builder
      workingDirectory: $(Build.SourcesDirectory)
  - task: Bash@3
    displayName: Temporarily Patch CK Submodule
    inputs:
      targetType: inline
      script: git pull origin develop
      workingDirectory: $(Build.SourcesDirectory)/pytorch/third_party/composable_kernel
  - task: Bash@3
    displayName: Install patchelf
    inputs:
@@ -267,6 +264,11 @@ jobs:
      script: |
        sudo bash pytorch/.ci/docker/common/install_rocm_magma.sh $(MAGMA_ROCM)
      workingDirectory: $(Build.SourcesDirectory)
  - task: Bash@3
    displayName: Install targeted typing_extensions for build
    inputs:
      targetType: inline
      script: pip install --target=$(Build.SourcesDirectory)/pytorch/torch/.. typing_extensions
  - task: Bash@3
    displayName: Run ROCm Build Script
    inputs:
@@ -281,7 +283,6 @@ jobs:
        PYTORCH_ROOT=$(PYTORCH_ROOT)
        CMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
        DESIRED_DEVTOOLSET=$(DESIRED_DEVTOOLSET)
        TORCH_PACKAGE_NAME=torch.$(ROCM_BRANCH).$(JOB_GPU_TARGET)
        PYTORCH_BUILD_VERSION=$(cat $(Build.SourcesDirectory)/pytorch/version.txt | cut -da -f1)
        PYTORCH_BUILD_NUMBER=$(date -u +%Y%m%d)
        SKIP_ALL_TESTS=1
@@ -322,8 +323,6 @@ jobs:
      inputs:
        targetType: inline
        script: >-
          TORCH_PACKAGE_NAME=torch.$(ROCM_BRANCH).$(JOB_GPU_TARGET)
          TORCHVISION_PACKAGE_NAME=torchvision.$(ROCM_BRANCH).$(JOB_GPU_TARGET)
          PYTORCH_VERSION=$(cat $(Build.SourcesDirectory)/pytorch/version.txt | cut -da -f1)post$(date -u +%Y%m%d)
          BUILD_VERSION=$(cat $(Build.SourcesDirectory)/vision/version.txt | cut -da -f1)post$(date -u +%Y%m%d)
          python3 setup.py bdist_wheel
@@ -400,11 +399,9 @@ jobs:
  - task: DownloadPipelineArtifact@2
    displayName: 'Download Pipeline Wheel Files'
    inputs:
-      itemPattern: '**/*$(JOB_GPU_TARGET)*.whl'
+      itemPattern: '**/*.whl'
      targetPath: $(Agent.BuildDirectory)
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    parameters:
      dependencySource: staging
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
    parameters:
      dependencyList: ${{ parameters.rocmTestDependencies }}
--- a/.azuredevops/nightly/rocm-nightly.yml
+++ b/.azuredevops/nightly/rocm-nightly.yml
@@ -3,12 +3,21 @@ parameters:
 - name: jobList
  type: object
  default:
-    - gfx942-staging:
+    - { os: ubuntu2204, target: gfx942, source: staging }
-      target: gfx942
+    - { os: ubuntu2204, target: gfx90a, source: staging }
-      source: staging
+    - { os: ubuntu2204, target: gfx1201, source: staging }
-    - gfx90a-staging:
+    - { os: ubuntu2204, target: gfx1100, source: staging }
-      target: gfx90a
+    - { os: ubuntu2204, target: gfx1030, source: staging }
-      source: staging
+    - { os: ubuntu2404, target: gfx942, source: staging }
    - { os: ubuntu2404, target: gfx90a, source: staging }
    - { os: ubuntu2404, target: gfx1201, source: staging }
    - { os: ubuntu2404, target: gfx1100, source: staging }
    - { os: ubuntu2404, target: gfx1030, source: staging }
    - { os: almalinux8, target: gfx942, source: staging }
    - { os: almalinux8, target: gfx90a, source: staging }
    - { os: almalinux8, target: gfx1201, source: staging }
    - { os: almalinux8, target: gfx1100, source: staging }
    - { os: almalinux8, target: gfx1030, source: staging }
 - name: rocmDependencies
  type: object
  default:
@@ -16,9 +25,9 @@ parameters:
    - amdsmi
    - aomp-extras
    - aomp
    - clr
    - composable_kernel
    - half
    - HIP
    - hip-tests
    - hipBLAS
    - hipBLAS-common
@@ -83,7 +92,7 @@ schedules:
 jobs:
 - ${{ each job in parameters.jobList }}:
-  - job: rocm_nightly_${{ job.target }}_${{ job.source }}
+  - job: rocm_nightly_${{ job.os }}_${{ job.target }}_${{ job.source }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -108,9 +117,9 @@ jobs:
      parameters:
        dependencySource: ${{ job.source }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
        skipLibraryLinking: true
        skipLlvmSymlink: true
    - script: df -h
      displayName: System disk space after ROCm
    - script: du -sh $(Agent.BuildDirectory)/rocm
--- a/.azuredevops/tag-builds/clr.yml
+++ b/.azuredevops/tag-builds/clr.yml
@@ -28,12 +28,22 @@ resources:
    endpoint: ROCm
    name: ROCm/hipother
    ref: ${{ parameters.checkoutRef }}
  pipelines:
  - pipeline: hip_pipeline
    source: \experimental\HIP
    trigger: true
  - pipeline: hipother_pipeline
    source: \experimental\hipother
    trigger: true
 trigger: none
 pr: none
 jobs:
-  - template: ${{ variables.CI_COMPONENT_PATH }}/HIP.yml
+  - ${{ if eq(variables['Build.Reason'], 'ResourceTrigger') }}:
-    parameters:
+    - template: ${{ variables.CI_COMPONENT_PATH }}/copyHIP.yml@pipelines_repo
-      checkoutRepo: release_repo
+  - ${{ if ne(variables['Build.Reason'], 'ResourceTrigger') }}:
-      checkoutRef: ${{ parameters.checkoutRef }}
+    - template: ${{ variables.CI_COMPONENT_PATH }}/HIP.yml@pipelines_repo
      parameters:
        checkoutRepo: release_repo
        checkoutRef: ${{ parameters.checkoutRef }}
--- a/.azuredevops/templates/steps/artifact-download.yml
+++ b/.azuredevops/templates/steps/artifact-download.yml
@@ -19,36 +19,27 @@ parameters:
  default: false
 steps:
 - task: Bash@3
  displayName: Set allowPartiallySucceededBuilds
  inputs:
    targetType: inline
    script: |
      if [[ ",$ALLOWED_PARTIAL_SUCCEED_BUILDS," == *",${{ parameters.componentName }},"* ]]; then
        echo "##vso[task.setvariable variable=allowPartiallySucceededBuilds;]true"
      else
        echo "##vso[task.setvariable variable=allowPartiallySucceededBuilds;]false"
      fi
 - task: DownloadPipelineArtifact@2
  displayName: Download ${{ parameters.componentName }}
  inputs:
-    ${{ if eq(parameters.aggregatePipeline, false) }}:
+    ${{ if parameters.aggregatePipeline }}:
      buildType: 'specific'
      project: ROCm-CI
      definition: ${{ parameters.pipelineId }}
      specificBuildWithTriggering: true
      itemPattern: '**/*${{ parameters.fileFilter }}*'
      # aomp is a special case, since the trigger file is under ROCm/ROCm instead of the component repo
      ${{ if notIn(parameters.componentName, 'aomp') }}:
        buildVersionToDownload: latestFromBranch # default is 'latest'
      branchName: refs/heads/${{ parameters.branchName }}
      allowPartiallySucceededBuilds: $(allowPartiallySucceededBuilds)
      targetPath: '$(Pipeline.Workspace)/d'
    ${{ else }}:
      buildType: 'current'
      itemPattern: '**/${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
-      allowPartiallySucceededBuilds: $(allowPartiallySucceededBuilds)
+      allowPartiallySucceededBuilds: true
      targetPath: '$(Pipeline.Workspace)/d'
    ${{ else }}:
      buildType: 'specific'
      project: ROCm-CI
      specificBuildWithTriggering: true
      allowPartiallySucceededBuilds: true
      definition: ${{ parameters.pipelineId }}
      itemPattern: '**/*${{ parameters.fileFilter }}*'
      targetPath: '$(Pipeline.Workspace)/d'
      branchName: refs/heads/${{ parameters.branchName }}
      ${{ if eq(parameters.componentName, 'aomp') }}:
        buildVersionToDownload: latest # aomp trigger lives in ROCm/ROCm, so cannot use ROCm/aomp branch names
      ${{ else }}:
        buildVersionToDownload: latestFromBranch
 - task: ExtractFiles@1
  displayName: Extract ${{ parameters.componentName }}
  inputs:
--- a/.azuredevops/templates/steps/artifact-upload.yml
+++ b/.azuredevops/templates/steps/artifact-upload.yml
@@ -3,15 +3,21 @@
 # publish can be toggled off for jobs that produce multiple tarballs
 # for those cases, only publish the last call which puts all the tarballs in one container folder
 parameters:
- name: artifactName
+- name: componentName
  type: string
-  default: 'drop'
+  default: $(Build.DefinitionName)
 - name: publish
  type: boolean
  default: true
 - name: gpuTarget
  type: string
  default: ''
 - name: artifactName
  type: string
  default: drop
 - name: publish
  type: boolean
  default: true
 - name: os
  type: string
  default: 'ubuntu2204'
 steps:
 - task: ArchiveFiles@2
@@ -20,7 +26,7 @@ steps:
    includeRootFolder: false
    archiveType: 'tar'
    tarCompression: 'gz'
-    archiveFile: '$(Build.ArtifactStagingDirectory)/$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz'
+    archiveFile: '$(Build.ArtifactStagingDirectory)/${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz'
 - task: DeleteFiles@1
  displayName: 'Cleanup Staging Area'
  inputs:
@@ -32,7 +38,7 @@ steps:
  inputs:
    workingDirectory: $(Pipeline.Workspace)
    targetType: inline
-    script: echo "$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz" >> pipelineArtifacts.txt
+    script: echo "${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz" >> pipelineArtifacts.txt
 # then publish it
 - ${{ if parameters.publish }}:
  - task: PublishPipelineArtifact@1
@@ -40,4 +46,5 @@ steps:
    displayName: '${{ parameters.artifactName }} Publish'
    retryCountOnTaskFailure: 3
    inputs:
      artifactName: ${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt)
      targetPath: '$(Build.ArtifactStagingDirectory)'
--- a/.azuredevops/templates/steps/build-autotools.yml
+++ b/.azuredevops/templates/steps/build-autotools.yml
@@ -1,4 +1,7 @@
 parameters:
 - name: os
  type: string
  default: 'ubuntu2204'
 - name: componentName
  type: string
  default: ''
@@ -20,17 +23,23 @@ steps:
  displayName: '${{ parameters.componentName }} configure flags'
  inputs:
    targetType: inline
    script: ./configure --prefix=${{ parameters.installDir }} ${{ parameters.configureFlags }}
    workingDirectory: ${{ parameters.buildDir }}
    script: |
      ${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
      ./configure --prefix=${{ parameters.installDir }} ${{ parameters.configureFlags }}
 - task: Bash@3
  displayName: '${{ parameters.componentName }} make'
  inputs:
    targetType: inline
    script: ${{ parameters.makeCallPrefix }} make -j$(nproc)
    workingDirectory: ${{ parameters.buildDir }}
    script: |
      ${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
      ${{ parameters.makeCallPrefix }} make -j$(nproc)
 - task: Bash@3
  displayName: '${{ parameters.componentName }} make install'
  inputs:
    targetType: inline
    script: make install
    workingDirectory: ${{ parameters.buildDir }}
    script: |
      ${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
      make install
--- a/.azuredevops/templates/steps/build-cmake.yml
+++ b/.azuredevops/templates/steps/build-cmake.yml
@@ -1,10 +1,16 @@
 parameters:
 - name: os
  type: string
  default: 'ubuntu2204'
 - name: componentName
  type: string
  default: ''
 - name: extraBuildFlags
  type: string
  default: ''
 - name: extraCxxFlags
  type: string
  default: ''
 - name: multithreadFlag
  type: string
  default: ''
@@ -32,41 +38,81 @@ parameters:
 - name: installEnabled
  type: boolean
  default: true
 # for jobs that rebuild during install step and use ninja
 # set to true to save time, only applies for almalinux8
 - name: consolidateBuildAndInstall
  type: boolean
  default: false
 - name: printDiskSpace
  type: boolean
  default: true
 # todo: make this control cxx and c compiler flags
 - name: useAmdclang
  type: boolean
  default: true
 # for cmake calls, set env variables for AlmaLinux 8
 # to simulate running source /opt/rh/gcc-toolset-14/enable for the session
 steps:
 # create workingDirectory if it does not exist and change into it
 # call cmake from within that directory using $cmakeArgs as its parameters
 - task: CMake@1
  displayName: '${{parameters.componentName }} CMake Flags'
  ${{ if eq(parameters.os, 'almalinux8')}}:
    env:
      PATH: "/opt/rh/gcc-toolset-14/root/usr/bin:$(PATH)"
      MANPATH: "/opt/rh/gcc-toolset-14/root/usr/share/man:$(MANPATH)"
      INFOPATH: "/opt/rh/gcc-toolset-14/root/usr/share/info:$(INFOPATH)"
      PCP_DIR: "/opt/rh/gcc-toolset-14/root"
      LD_LIBRARY_PATH: "/opt/rh/gcc-toolset-14/root/usr/lib64:/opt/rh/gcc-toolset-14/root/usr/lib:$(LD_LIBRARY_PATH)"
      PKG_CONFIG_PATH: "/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig:$(PKG_CONFIG_PATH)"
  inputs:
    workingDirectory: ${{ parameters.cmakeBuildDir }}
-    ${{ if eq(parameters.customInstallPath, true) }}:
+    cmakeArgs: >-
-      cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
+      ${{ iif(parameters.customInstallPath, join('', format('-DCMAKE_INSTALL_PREFIX={0}', parameters.installDir)), '') }}
-    ${{ else }}:
+      ${{ iif(eq(parameters.os, 'almalinux8'), '-DCMAKE_SHARED_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/lib64 -L/opt/rh/gcc-toolset-14/root/usr/lib/gcc/x86_64-redhat-linux/14/"', '') }}
-      cmakeArgs: ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
+      ${{ iif(eq(parameters.os, 'almalinux8'), '-DCMAKE_EXE_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/lib64 -L/opt/rh/gcc-toolset-14/root/usr/lib/gcc/x86_64-redhat-linux/14/"', '') }}
      -DCMAKE_CXX_FLAGS="${{ parameters.extraCxxFlags }} ${{ iif(and(eq(parameters.os, 'almalinux8'), parameters.useAmdclang), '--gcc-toolchain=/opt/rh/gcc-toolset-14/root', '') }}"
      ${{ parameters.extraBuildFlags }}
      ${{ parameters.cmakeSourceDir }}
 - ${{ if parameters.printDiskSpace }}:
  - script: df -h
    displayName: Disk space before build
 # equivalent to running make $cmakeTargetDir from $cmakeBuildDir
 # i.e., cd $cmakeBuildDir; make $cmakeTargetDir
 - task: CMake@1
-  displayName: '${{parameters.componentName }} Build'
+  ${{ if and( eq(parameters.os, 'almalinux8'), eq(parameters.consolidateBuildAndInstall , true)) }}:
    displayName: '${{ parameters.componentName }} CMake Build and Install'
  ${{ else }}:
    displayName: '${{ parameters.componentName }} CMake Build'
  ${{ if eq(parameters.os, 'almalinux8')}}:
    env:
      PATH: "/opt/rh/gcc-toolset-14/root/usr/bin:$(PATH)"
      MANPATH: "/opt/rh/gcc-toolset-14/root/usr/share/man:$(MANPATH)"
      INFOPATH: "/opt/rh/gcc-toolset-14/root/usr/share/info:$(INFOPATH)"
      PCP_DIR: "/opt/rh/gcc-toolset-14/root"
      LD_LIBRARY_PATH: "/opt/rh/gcc-toolset-14/root/usr/lib64:/opt/rh/gcc-toolset-14/root/usr/lib:$(LD_LIBRARY_PATH)"
      PKG_CONFIG_PATH: "/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig:$(PKG_CONFIG_PATH)"
  inputs:
    workingDirectory: ${{ parameters.cmakeBuildDir }}
-    ${{ if eq(parameters.customBuildTarget, '') }}:
+    ${{ if eq(parameters.os, 'almalinux8') }}:
-      cmakeArgs: '--build ${{ parameters.cmakeTargetDir }} ${{ parameters.multithreadFlag }}'
+      cmakeArgs: >-
-    ${{ else }}:
+        --build ${{ parameters.cmakeTargetDir }}
-      cmakeArgs: '--build ${{ parameters.cmakeTargetDir }} --target ${{ parameters.customBuildTarget }} ${{ parameters.multithreadFlag }}'
+        ${{ iif(and(eq(parameters.consolidateBuildAndInstall, true), ne(parameters.cmakeTarget, '')), format('--target {0}', parameters.cmakeTarget), '') }}
-    retryCountOnTaskFailure: 10
+        ${{ iif(and(ne(parameters.customBuildTarget, ''), ne(parameters.consolidateBuildAndInstall, true)), format('--target {0}', parameters.customBuildTarget), '') }}
        ${{ parameters.multithreadFlag }}
    ${{ if ne(parameters.os, 'almalinux8') }}:
      cmakeArgs: >-
        --build ${{ parameters.cmakeTargetDir }}
        ${{ iif(ne(parameters.customBuildTarget, ''), format('--target {0}', parameters.customBuildTarget), '') }}
        ${{ parameters.multithreadFlag }}
 - ${{ if parameters.printDiskSpace }}:
  - script: df -h
    displayName: Disk space after build
 # equivalent to running make $cmakeTarget from $cmakeBuildDir
 # e.g., make install
- ${{ if eq(parameters.installEnabled, true) }}:
+- ${{ if and(eq(parameters.installEnabled, true), or(ne(parameters.os, 'almalinux8'), eq(parameters.consolidateBuildAndInstall, false))) }}:
  - task: CMake@1
    displayName: '${{parameters.componentName }} ${{ parameters.cmakeTarget }}'
    inputs:
--- a/.azuredevops/templates/steps/checkout.yml
+++ b/.azuredevops/templates/steps/checkout.yml
@@ -4,9 +4,6 @@ parameters:
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: sparseCheckout
  type: boolean
  default: false
 - name: sparseCheckoutDir
  type: string
  default: ''
@@ -22,10 +19,10 @@ steps:
    submodules: ${{ parameters.submoduleBehaviour }}
    retryCountOnTaskFailure: 3
    fetchFilter: blob:none
-    ${{ if eq(parameters.sparseCheckout, true) }}:
+    ${{ if ne(parameters.sparseCheckoutDir, '') }}:
      sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }}
      path: sparse
-  - ${{ if eq(parameters.sparseCheckout, true) }}:
+  - ${{ if ne(parameters.sparseCheckoutDir, '') }}:
    - task: Bash@3
      displayName: Symlink sparse checkout
      inputs:
--- a/.azuredevops/templates/steps/dependencies-apt.yml
+++ b/.azuredevops/templates/steps/dependencies-apt.yml
@@ -0,0 +1,42 @@
 # Step template for apt-based agents: optionally registers the AMDGPU and ROCm
 # apt repositories, refreshes the package index, repairs broken installs, and
 # then installs the requested apt packages.
 parameters:
 # aptPackages: package names; joined with spaces into a single apt-get install.
 - name: aptPackages
  type: object
  default: []
 # registerROCmPackages: when true, adds the repo.radeon.com key, sources and pin.
 - name: registerROCmPackages
  type: boolean
  default: false
 steps:
 # Only emitted when registerROCmPackages is true: stores the dearmored ROCm key
 # under /etc/apt/keyrings, writes the amdgpu and rocm source lists for
 # $(REPO_RADEON_VERSION) on jammy, and pins repo.radeon.com at priority 600.
 - ${{ if eq(parameters.registerROCmPackages, true) }}:
  - task: Bash@3
    displayName: 'Register AMDGPU & ROCm repos (apt)'
    inputs:
      targetType: inline
      script: |
        sudo mkdir --parents --mode=0755 /etc/apt/keyrings
        wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
        echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/$(REPO_RADEON_VERSION)/ubuntu jammy main" | sudo tee /etc/apt/sources.list.d/amdgpu.list
        echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION) jammy main" | sudo tee --append /etc/apt/sources.list.d/rocm.list
        echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
        sudo apt update
 # Appends the Ubuntu jammy archive entries (main, updates, backports, security)
 # to default.list, then refreshes the package index non-interactively.
 - task: Bash@3
  displayName: 'sudo apt-get update'
  inputs:
    targetType: inline
    script: |
      echo "deb http://archive.ubuntu.com/ubuntu/ jammy main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
      echo "deb http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
      echo "deb http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
      echo "deb http://archive.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
      sudo DEBIAN_FRONTEND=noninteractive apt-get --yes update
 # Repairs any broken package state before the requested packages are installed.
 - task: Bash@3
  displayName: 'sudo apt-get fix'
  inputs:
    targetType: inline
    script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-broken install
 # The install step is only emitted when at least one package was requested.
 - ${{ if gt(length(parameters.aptPackages), 0) }}:
  - task: Bash@3
    displayName: 'sudo apt-get install ...'
    inputs:
      targetType: inline
      script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install ${{ join(' ', parameters.aptPackages) }}
--- a/.azuredevops/templates/steps/dependencies-aqlprofile.yml
+++ b/.azuredevops/templates/steps/dependencies-aqlprofile.yml
@@ -1,25 +1,44 @@
 parameters:
 - name: os
  type: string
  default: ubuntu2204
 steps:
 - task: Bash@3
  displayName: Get aqlprofile package name
  inputs:
    targetType: inline
-    script: |
+    ${{ if eq(parameters.os, 'ubuntu2204') }}:
-      export packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb")
+      script: |
-      echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
+        export packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb")
        echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
    ${{ if eq(parameters.os, 'almalinux8') }}:
      script: |
        export packageName=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1)
        echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
 - task: Bash@3
  displayName: 'Download aqlprofile'
  inputs:
    targetType: inline
    script: wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$(packageName)
    workingDirectory: '$(Pipeline.Workspace)'
    ${{ if eq(parameters.os, 'ubuntu2204') }}:
      script: wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$(packageName)
    ${{ if eq(parameters.os, 'almalinux8') }}:
      script: wget -nv https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/$(packageName)
 - task: Bash@3
  displayName: 'Extract aqlprofile'
  inputs:
    targetType: inline
    script: |
      mkdir hsa-amd-aqlprofile
      dpkg-deb -R $(packageName) hsa-amd-aqlprofile
    workingDirectory: '$(Pipeline.Workspace)'
    ${{ if eq(parameters.os, 'ubuntu2204') }}:
      script: |
        mkdir hsa-amd-aqlprofile
        dpkg-deb -R $(packageName) hsa-amd-aqlprofile
    ${{ if eq(parameters.os, 'almalinux8') }}:
      script: |
        mkdir hsa-amd-aqlprofile
        sudo dnf -y install rpm-build cpio
        rpm2cpio $(packageName) | (cd hsa-amd-aqlprofile && cpio -idmv)
 - task: Bash@3
  displayName: 'Copy aqlprofile files'
  inputs:
--- a/.azuredevops/templates/steps/dependencies-boost.yml
+++ b/.azuredevops/templates/steps/dependencies-boost.yml
@@ -1,35 +0,0 @@
 steps:
 - task: DownloadPipelineArtifact@2
  displayName: Download Boost
  inputs:
    buildType: specific
    project: ROCm-CI
    definition: $(BOOST_DEPENDENCY_PIPELINE_ID)
    targetPath: $(Pipeline.Workspace)/d
 - task: ExtractFiles@1
  displayName: Extract Boost
  inputs:
    archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
    destinationFolder: $(Agent.BuildDirectory)/boost
    cleanDestinationFolder: true
    overwriteExistingFiles: true
 - task: DeleteFiles@1
  displayName: Cleanup Compressed Boost
  inputs:
    SourceFolder: $(Pipeline.Workspace)/d
    Contents: '**/*.tar.gz'
    RemoveDotFiles: true
 - task: Bash@3
  displayName: 'List Boost files'
  inputs:
    targetType: inline
    script: ls -1R $(Agent.BuildDirectory)/boost
 - task: Bash@3
  displayName: 'Link Boost shared libraries'
  inputs:
    targetType: inline
    script: |
      echo $(Agent.BuildDirectory)/boost/lib | sudo tee /etc/ld.so.conf.d/boost.conf
      sudo cat /etc/ld.so.conf.d/boost.conf
      sudo ldconfig -v
      ldconfig -p
--- a/.azuredevops/templates/steps/dependencies-cmake-latest.yml
+++ b/.azuredevops/templates/steps/dependencies-cmake-latest.yml
@@ -1,10 +1,23 @@
 # install a pinned CMake release from the Kitware GitHub tarball and prepend it to PATH
 steps:
 - task: Bash@3
-  displayName: update cmake
+  displayName: Install CMake 3.31
  inputs:
    targetType: inline
    script: |
-      sudo apt purge cmake -y
+      CMAKE_VERSION=3.31.0
-      sudo snap install cmake --classic --channel=3.31/stable
+      CMAKE_ROOT="$(Pipeline.Workspace)/cmake"
-      hash -r
+
      echo "Downloading CMake $CMAKE_VERSION..."
      curl -fsSL -o cmake.tar.gz https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz
      echo "Extracting to $CMAKE_ROOT..."
      sudo mkdir -p $CMAKE_ROOT
      sudo tar --strip-components=1 -xz -C $CMAKE_ROOT -f cmake.tar.gz
      echo "##vso[task.prependpath]$CMAKE_ROOT/bin"
 - task: Bash@3
  displayName: cmake --version
  inputs:
    targetType: inline
    script: |
      cmake --version
--- a/.azuredevops/templates/steps/dependencies-dnf.yml
+++ b/.azuredevops/templates/steps/dependencies-dnf.yml
@@ -0,0 +1,157 @@
 parameters:
 - name: aptPackages
  type: object
  default: []
 - name: registerROCmPackages
  type: boolean
  default: false
 # As part of installing the gcc toolset and python,
 # the environment will install this base set of dnf packages.
 - name: basePackages
  type: object
  default:
    - epel-release
    - gcc-toolset-14
    - gcc-toolset-14-libatomic-devel
    - git
    - jq
    - numactl
    - python3.11
    - python3.11-pip
    - vim-common
    - wget
 # Instead of defining multiple arrays of packages per component,
 # we define a map of apt package names to dnf package names.
 - name: aptToDnfMap
  type: object
  default:
    bison: bison
    ccache: ccache
    cmake: cmake
    cuda-toolkit-12-9: cuda-compiler-12-9 cuda-toolkit-12-9
    libcudnn9-dev-cuda-12: libcudnn9-cuda-12
    dejagnu: dejagnu
    doxygen: doxygen
    # note: doxygen-doc is not available in dnf
    # libavcodec-dev, libavformat-dev, libavutil-dev come with ffmpeg-devel
    ffmpeg: ffmpeg ffmpeg-devel
    flex: flex
    # note: g++ is installed by default with gcc-toolset-14
    # note: gawk is already installed
    # note: gcc-toolset-14-gfortran is installed by default with gcc-toolset-14
    # note: git is in the base packages list
    graphviz: graphviz
    libbabeltrace-dev: libbabeltrace-devel
    libbison-dev: bison-devel
    libboost-program-options-dev: boost-devel
    # note: libdrm-amdgpu1 is not available in dnf
    libdrm-dev: libdrm-devel
    libdrm-amdgpu-dev: libdrm-amdgpu-devel
    libdw-dev: elfutils-devel
    libelf-dev: elfutils-libelf-devel
    libexpat-dev: expat-devel
    libffi-dev: libffi-devel
    libfftw3-dev: fftw-devel
    libgmp-dev: gmp-devel
    liblzma-dev: xz-devel
    libmpfr-dev: mpfr-devel
    libmsgpack-dev: msgpack-devel
    libncurses5-dev: ncurses-devel
    libnuma-dev: numactl-devel
    libopenmpi-dev: openmpi-devel
    libpci-dev: libpciaccess-devel
    libssl-dev: openssl-devel
    # note: libstdc++-devel is in the base packages list
    libsystemd-dev: systemd-devel
    libtool: libtool
    # note: libudev-dev is part of systemd-devel
    libva-amdgpu-dev: libva-amdgpu-devel
    mesa-amdgpu-va-drivers: mesa-amdgpu-va-drivers
    mesa-common-dev: mesa-libGL-devel
    ncurses-dev: ncurses-devel
    # note: llvm needs ninja-build version newer than what dnf provides
    ocl-icd-libopencl1: ocl-icd
    ocl-icd-opencl-dev: ocl-icd-devel
    opencl-headers: opencl-headers
    parallel: parallel
    pkg-config: pkgconf-pkg-config
    # note: python3 is the default python in AlmaLinux 8
    python3-dev: python3.11-devel
    # note: python3.11-pip is already installed when updating to python 3.11
    # note: python3.11-setuptools is already installed when updating to python 3.11
    texinfo: texinfo
    zlib1g-dev: zlib-devel
 steps:
 # Only emitted when registerROCmPackages is true: imports the ROCm GPG key and
 # writes amdgpu.repo and rocm.repo under /etc/yum.repos.d for
 # $(REPO_RADEON_VERSION). The first tee of each file has no --append, so the
 # file is started fresh and then extended line by line. The dnf cache is
 # cleaned and rebuilt at the end.
 - ${{ if eq(parameters.registerROCmPackages, true) }}:
  - task: Bash@3
    displayName: 'Register AMDGPU & ROCm repos (dnf)'
    inputs:
      targetType: inline
      script: |
        sudo rpm --import https://repo.radeon.com/rocm/rocm.gpg.key
        echo '[amdgpu]' | sudo tee /etc/yum.repos.d/amdgpu.repo > /dev/null
        echo "name=amdgpu" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
        echo "baseurl=https://repo.radeon.com/amdgpu/$(REPO_RADEON_VERSION)/rhel/8.10/main/x86_64/" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
        echo "enabled=1" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
        echo "gpgcheck=1" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
        echo "gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
        echo '[rocm]' | sudo tee /etc/yum.repos.d/rocm.repo > /dev/null
        echo "name=ROCm$(REPO_RADEON_VERSION)" | sudo tee --append /etc/yum.repos.d/rocm.repo
        echo "baseurl=https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/" | sudo tee --append /etc/yum.repos.d/rocm.repo
        echo "enabled=1" | sudo tee --append /etc/yum.repos.d/rocm.repo
        echo "gpgcheck=1" | sudo tee --append /etc/yum.repos.d/rocm.repo
        echo "gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" | sudo tee --append /etc/yum.repos.d/rocm.repo
        sudo dnf clean all
        sudo dnf makecache
 # Always runs: enables the powertools repo, installs the RPM Fusion free
 # release package, then installs every entry of the basePackages parameter in
 # one dnf call.
 - task: Bash@3
  displayName: 'Install base dnf packages'
  inputs:
    targetType: inline
    script: |
      sudo dnf config-manager --set-enabled powertools
      # rpm fusion free repo for some dependencies
      sudo dnf -y install https://download1.rpmfusion.org/free/el/rpmfusion-free-release-8.noarch.rpm
      sudo dnf -y install ${{ join(' ', parameters.basePackages) }}
 # Diagnostic step: prints the gcc and g++ versions and the libstdc++.so path
 # each compiler resolves.
 # NOTE(review): this script does not source /opt/rh/gcc-toolset-14/enable, so it
 # presumably reports whichever gcc is first on the agent's PATH - confirm.
 - task: Bash@3
  displayName: 'Check gcc environment'
  inputs:
    targetType: inline
    script: |
      echo "=== Versions and sanity checks ==="
      gcc --version
      g++ --version
      gcc -print-file-name=libstdc++.so
      g++ -print-file-name=libstdc++.so
 # Makes python 3.11 the default interpreter: disables the python36 dnf module,
 # deletes the python3.12/3.13/3.14 binaries under /usr/local/bin, points the
 # python and python3 alternatives at /usr/bin/python3.11, and upgrades pip,
 # setuptools and wheel for that interpreter.
 # NOTE(review): the /usr/local/bin removals are presumably there so newer
 # preinstalled interpreters cannot shadow 3.11 - confirm against the agent image.
 - task: Bash@3
  displayName: 'Set python 3.11 as default'
  inputs:
    targetType: inline
    script: |
      sudo dnf -y module disable python36
      sudo rm -f /usr/local/bin/python3.12 /usr/local/bin/python3.13 /usr/local/bin/python3.14
      sudo alternatives --set python /usr/bin/python3.11
      sudo alternatives --set python3 /usr/bin/python3.11
      python3 --version
      python3 -m pip install --upgrade pip setuptools wheel
 # Expands at template time into one step per requested apt package name.
 # 'ninja-build' is special-cased: the v1.11.1 release binary is downloaded from
 # GitHub, moved to /usr/local/bin, and /usr/local/bin is prepended to PATH.
 # Any package with a non-empty aptToDnfMap entry gets a dnf install step that
 # uses the mapped dnf name(s).
 # NOTE(review): names with no entry in aptToDnfMap appear to produce no step at
 # all, so they are skipped silently - confirm this is intended.
 - ${{ each pkg in parameters.aptPackages }}:
  # note: llvm needs ninja-build version newer than what dnf provides
  - ${{ if eq(pkg, 'ninja-build') }}:
    - task: Bash@3
      displayName: 'Install ninja 1.11.1'
      inputs:
        targetType: inline
        script: |
          curl -LO https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip
          sudo dnf -y install unzip
          unzip ninja-linux.zip
          sudo mv ninja /usr/local/bin/ninja
          sudo chmod +x /usr/local/bin/ninja
          echo "##vso[task.prependpath]/usr/local/bin"
  - ${{ if ne(parameters.aptToDnfMap[pkg], '') }}:
    - task: Bash@3
      displayName: 'dnf install ${{ parameters.aptToDnfMap[pkg] }}'
      inputs:
        targetType: inline
        script: |
          sudo dnf -y install ${{ parameters.aptToDnfMap[pkg] }}
--- a/.azuredevops/templates/steps/dependencies-other.yml
+++ b/.azuredevops/templates/steps/dependencies-other.yml
@@ -9,56 +9,24 @@ parameters:
 - name: registerROCmPackages
  type: boolean
  default: false
 - name: packageManager
  type: string
  default: apt
 steps:
- ${{ if eq(parameters.registerROCmPackages, true) }}:
+- ${{ if eq(parameters.packageManager, 'apt') }}:
-  - task: Bash@3
+  - template: dependencies-apt.yml
-    displayName: 'Register AMDGPU & ROCm repos'
+    parameters:
-    inputs:
+      aptPackages: ${{ parameters.aptPackages }}
-      targetType: inline
+      registerROCmPackages: ${{ parameters.registerROCmPackages }}
-      script: |
+- ${{ if eq(parameters.packageManager, 'dnf') }}:
-        sudo mkdir --parents --mode=0755 /etc/apt/keyrings
+  - template: dependencies-dnf.yml
-        wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
+    parameters:
-        echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/$(REPO_RADEON_VERSION)/ubuntu jammy main" | sudo tee /etc/apt/sources.list.d/amdgpu.list
+      aptPackages: ${{ parameters.aptPackages }}
-        echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION) jammy main" | sudo tee --append /etc/apt/sources.list.d/rocm.list
+      registerROCmPackages: ${{ parameters.registerROCmPackages }}
        echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
        sudo apt update
 # firefox takes time to upgrade and is not needed for CI workloads, hold version
 - task: Bash@3
  continueOnError: true
  displayName: 'sudo apt-mark hold firefox'
  inputs:
    targetType: inline
    script: sudo apt-mark hold firefox
 - task: Bash@3
  displayName: 'sudo apt-get update'
  inputs:
    targetType: inline
    script: |
      echo "deb http://archive.ubuntu.com/ubuntu/ jammy main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
      echo "deb http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
      echo "deb http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
      echo "deb http://archive.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
      sudo DEBIAN_FRONTEND=noninteractive apt-get --yes update
 - task: Bash@3
  displayName: 'sudo apt-get upgrade'
  inputs:
    targetType: inline
    script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes upgrade
 - task: Bash@3
  displayName: 'sudo apt-get fix'
  inputs:
    targetType: inline
    script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-broken install
 - ${{ if gt(length(parameters.aptPackages), 0) }}:
  - task: Bash@3
    displayName: 'sudo apt-get install ...'
    inputs:
      targetType: inline
      script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install ${{ join(' ', parameters.aptPackages) }}
 - ${{ if gt(length(parameters.pipModules), 0) }}:
  - task: Bash@3
    displayName: 'pip install  ...'
    inputs:
      targetType: inline
-      script: pip install -v --force-reinstall ${{ join(' ', parameters.pipModules) }}
+      script: python3 -m pip install -v --force-reinstall ${{ join(' ', parameters.pipModules) }}
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -13,6 +13,9 @@ parameters:
 - name: dependencyList
  type: object
  default: []
 - name: os
  type: string
  default: 'ubuntu2204'
 - name: gpuTarget
  type: string
  default: ''
@@ -36,6 +39,10 @@ parameters:
 - name: aggregatePipeline
  type: boolean
  default: false
 # monorepo related parameters
 - name: downstreamAggregateNames
  type: string
  default: ''
 - name: componentVarList
  type: object
@@ -103,7 +110,7 @@ parameters:
    hipCUB:
      pipelineId: $(HIPCUB_PIPELINE_ID)
      stagingBranch: develop
-      mainlineBranch: mainline
+      mainlineBranch: develop
      hasGpuTarget: true
    hipFFT:
      pipelineId: $(HIPFFT_PIPELINE_ID)
@@ -123,7 +130,7 @@ parameters:
    hipRAND:
      pipelineId: $(HIPRAND_PIPELINE_ID)
      stagingBranch: develop
-      mainlineBranch: mainline
+      mainlineBranch: develop
      hasGpuTarget: true
    hipSOLVER:
      pipelineId: $(HIPSOLVER_PIPELINE_ID)
@@ -258,7 +265,7 @@ parameters:
    rocPRIM:
      pipelineId: $(ROCPRIM_PIPELINE_ID)
      stagingBranch: develop
-      mainlineBranch: mainline
+      mainlineBranch: develop
      hasGpuTarget: true
    rocprofiler:
      pipelineId: $(ROCPROFILER_PIPELINE_ID)
@@ -298,7 +305,7 @@ parameters:
    rocRAND:
      pipelineId: $(ROCRAND_PIPELINE_ID)
      stagingBranch: develop
-      mainlineBranch: mainline
+      mainlineBranch: develop
      hasGpuTarget: true
    rocr_debug_agent:
      pipelineId: $(ROCR_DEBUG_AGENT_PIPELINE_ID)
@@ -323,7 +330,7 @@ parameters:
    rocThrust:
      pipelineId: $(ROCTHRUST_PIPELINE_ID)
      stagingBranch: develop
-      mainlineBranch: mainline
+      mainlineBranch: develop
      hasGpuTarget: true
    roctracer:
      pipelineId: $(ROCTRACER_PIPELINE_ID)
@@ -361,7 +368,7 @@ steps:
        pipelineId: ${{ parameters.componentVarList[split(dependency, ':')[0]].pipelineId }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.componentVarList[split(dependency, ':')[0]].hasGpuTarget }}:
-          fileFilter: "${{ split(dependency, ':')[1] }}*${{ parameters.gpuTarget }}"
+          fileFilter: "${{ split(dependency, ':')[1] }}*_${{ parameters.os }}_${{ parameters.gpuTarget }}"
        # dependencySource = staging
        ${{ if eq(parameters.dependencySource, 'staging')}}:
          branchName: ${{ parameters.componentVarList[split(dependency, ':')[0]].stagingBranch }}
@@ -384,6 +391,14 @@ steps:
        ${{ else }}:
          branchName: ${{ parameters.componentVarList[split(dependency, ':')[0]].stagingBranch }}
 # no colon (:) found in this item in the list
  - ${{ elseif containsValue(split(parameters.downstreamAggregateNames, '+'), dependency) }}:
    - template: local-artifact-download.yml
      parameters:
        ${{ if parameters.componentVarList[dependency].hasGpuTarget }}:
          gpuTarget: ${{ parameters.gpuTarget }}
        preTargetFilter: ${{ dependency }}
        os: ${{ parameters.os }}
        buildType: current
  - ${{ else }}:
    - template: artifact-download.yml
      parameters:
@@ -391,7 +406,9 @@ steps:
        pipelineId: ${{ parameters.componentVarList[dependency].pipelineId }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.componentVarList[dependency].hasGpuTarget }}:
-          fileFilter: ${{ parameters.gpuTarget }}
+          fileFilter: ${{ parameters.os }}_${{ parameters.gpuTarget }}
        ${{ else }}:
          fileFilter: ${{ parameters.os }}
        # dependencySource = staging
        ${{ if eq(parameters.dependencySource, 'staging')}}:
          branchName: ${{ parameters.componentVarList[dependency].stagingBranch }}
@@ -419,14 +436,16 @@ steps:
    displayName: Symlink from rocm/llvm to rocm/lib/llvm
    inputs:
      targetType: inline
-      script: sudo ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
+      script: |
        sudo mkdir -p $(Agent.BuildDirectory)/rocm/lib
        sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
  - task: Bash@3
    displayName: Symlink executables from rocm/llvm/bin to rocm/bin
    inputs:
      targetType: inline
      script: |
        for file in amdclang amdclang++ amdclang-cl amdclang-cpp amdflang amdlld aompcc mygpu mycpu offload-arch; do
-          sudo ln -s $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
+          sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
        done
 # dlopen calls within a ctest or pytest sequence runs into issues when shared library symlink convention is not followed
 # the convention is as follows:
@@ -471,8 +490,10 @@ steps:
      targetType: inline
 # OS ignores if the ROCm lib folder shows up more than once
      script: |
-        echo $(Agent.BuildDirectory)/rocm/lib | sudo tee /etc/ld.so.conf.d/rocm-ci.conf
+        echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
        echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
        echo $(Agent.BuildDirectory)/rocm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
        echo $(Agent.BuildDirectory)/rocm/llvm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
        sudo cat /etc/ld.so.conf.d/rocm-ci.conf
        sudo ldconfig -v
        ldconfig -p
--- a/.azuredevops/templates/steps/dependencies-vendor.yml
+++ b/.azuredevops/templates/steps/dependencies-vendor.yml
@@ -0,0 +1,53 @@
 # Step template: downloads prebuilt vendor dependencies from ROCm-CI pipeline
 # artifacts, unpacks them into $(Agent.BuildDirectory)/vendor and registers the
 # vendored library folders with the dynamic linker.
 #
 # Parameters:
 #   os             - substring used to select artifacts (itemPattern '**/*<os>*')
 #   dependencyList - names of the dependencies to fetch; each name must be a key
 #                    of pipelineIdList
 #   pipelineIdList - map of dependency name -> ROCm-CI pipeline definition id
 parameters:
 - name: os
  type: string
  default: 'ubuntu2204'
 - name: dependencyList
  type: object
 - name: pipelineIdList
  type: object
  default:
    boost: 250
    grpc: 72
    gtest: 73
    half560: 68
    lapack: 69
 steps:
 # One download / extract / clean-up triplet is emitted per requested dependency.
 - ${{ each dependency in parameters.dependencyList }}:
  - task: DownloadPipelineArtifact@2
    displayName: Download ${{ dependency }}
    inputs:
      project: ROCm-CI
      buildType: specific
      targetPath: $(Pipeline.Workspace)/d
      definition: ${{ parameters.pipelineIdList[dependency] }}
      itemPattern: '**/*${{ parameters.os }}*'
  - task: ExtractFiles@1
    displayName: Extract ${{ dependency }}
    inputs:
      archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
      destinationFolder: $(Agent.BuildDirectory)/vendor
      # All dependencies share the vendor folder: cleaning it before each
      # extraction would discard everything unpacked by earlier iterations.
      cleanDestinationFolder: false
      overwriteExistingFiles: true
  # Remove the downloaded archives so the next iteration only extracts its own.
  - task: DeleteFiles@1
    displayName: Clean up ${{ dependency }}
    inputs:
      SourceFolder: $(Pipeline.Workspace)/d
      Contents: '**/*.tar.gz'
      RemoveDotFiles: true
 - task: Bash@3
  displayName: List vendored files
  inputs:
    targetType: inline
    script: ls -la1R $(Agent.BuildDirectory)/vendor
 # Append the vendored lib folders to an ld.so.conf.d entry and refresh the
 # linker cache so the shared libraries can be resolved at run time.
 - task: Bash@3
  displayName: Link vendored shared libraries
  inputs:
    targetType: inline
    script: |
      echo $(Agent.BuildDirectory)/vendor/lib | sudo tee -a /etc/ld.so.conf.d/vendor.conf
      echo $(Agent.BuildDirectory)/vendor/lib64 | sudo tee -a /etc/ld.so.conf.d/vendor.conf
      sudo cat /etc/ld.so.conf.d/vendor.conf
      sudo ldconfig -v
      ldconfig -p
--- a/.azuredevops/templates/steps/local-artifact-download.yml
+++ b/.azuredevops/templates/steps/local-artifact-download.yml
@@ -2,6 +2,9 @@
 # It can be overridden to download any artifact from any pipeline, given the appropriate build/pipeline IDs
 parameters:
  - name: os
    type: string
    default: 'ubuntu2204'
  - name: gpuTarget
    type: string
    default: ''
@@ -29,25 +32,27 @@ parameters:
 steps:
  - task: DownloadPipelineArtifact@2
-    displayName: 'Download Pipeline Build'
+    displayName: Download ${{ parameters.preTargetFilter}}*${{ parameters.os }}_${{ parameters.gpuTarget}}*${{ parameters.postTargetFilter}}
    inputs:
      ${{ if eq(parameters.buildType, 'specific') }}:
        buildType: specific
        buildVersionToDownload: specific
        project: ROCm-CI
-        definition: ${{ parameters.definitionId }}
+        ${{ if ne(parameters.definitionId, 0) }}:
-        buildId: ${{ parameters.buildId }}
+          definition: ${{ parameters.definitionId }}
-      itemPattern: '**/*${{ parameters.preTargetFilter }}*${{ parameters.gpuTarget }}*${{ parameters.postTargetFilter }}*'
+        ${{ if ne(parameters.buildId, 0) }}:
          buildId: ${{ parameters.buildId }}
      itemPattern: '**/*${{ parameters.preTargetFilter }}*${{ parameters.os }}_${{ parameters.gpuTarget }}*${{ parameters.postTargetFilter }}*'
      targetPath: $(Pipeline.Workspace)/d
  - task: ExtractFiles@1
-    displayName: 'Extract Pipeline Build'
+    displayName: Extract ${{ parameters.preTargetFilter}}*${{ parameters.os }}_${{ parameters.gpuTarget}}*${{ parameters.postTargetFilter}}
    inputs:
      archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
      destinationFolder: '$(Agent.BuildDirectory)/rocm'
      cleanDestinationFolder: false
      overwriteExistingFiles: true
  # Remove the downloaded archives once they have been extracted.
  - task: DeleteFiles@1
-    displayName: 'Clean up Compressed Pipeline Build'
+    displayName: Clean up ${{ parameters.preTargetFilter}}*${{ parameters.os }}_${{ parameters.gpuTarget}}*${{ parameters.postTargetFilter}}
    inputs:
      SourceFolder: '$(Pipeline.Workspace)/d'
      # Must match the archive type extracted above (*.tar.gz), relative to SourceFolder.
      Contents: '**/*.tar.gz'
--- a/.azuredevops/templates/steps/manifest.yml
+++ b/.azuredevops/templates/steps/manifest.yml
@@ -1,10 +1,19 @@
 parameters:
- name: artifactName
+- name: componentName
  type: string
-  default: 'drop'
+  default: $(Build.DefinitionName)
 - name: sparseCheckoutDir
  type: string
  default: ''
 - name: gpuTarget
  type: string
  default: ''
 - name: artifactName
  type: string
  default: drop
 - name: os
  type: string
  default: 'ubuntu2204'
 steps:
 - task: Bash@3
@@ -25,8 +34,9 @@ steps:
      IS_TAG_BUILD=$(jq 'has("release_repo")' resources.repositories)
      IS_AOMP_BUILD=$(jq 'has("aomp_repo")' resources.repositories)
      IS_MATHLIBS_BUILD=$(jq 'has("libraries_repo")' resources.repositories)
-      if [ "$IS_TAG_BUILD" = "true" ] || [ "$IS_AOMP_BUILD" = "true" ]; then
+      if [ "$IS_TAG_BUILD" = "true" ] || [ "$IS_AOMP_BUILD" = "true" ] || [ "$IS_MATHLIBS_BUILD" = "true" ]; then
        exclude_keys=("pipelines_repo" "self") # Triggered by a file under ROCm/ROCm
      else
        exclude_keys=("pipelines_repo") # Triggered by a file under a component repo
@@ -45,6 +55,7 @@ steps:
              buildId: "$(Build.BuildId)",
              repoId: $entry.value.id,
              repoName: $entry.value.name,
              repoSparse: "${{ parameters.sparseCheckoutDir }}",
              repoRef: $entry.value.ref,
              repoUrl: $entry.value.url,
              repoVersion: $entry.value.version
@@ -55,7 +66,7 @@ steps:
        )
      ' resources.repositories)
-      manifest_json=$(Build.ArtifactStagingDirectory)/manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json
+      manifest_json=$(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json
      dependencies=()
      for manifest_file in $(Pipeline.Workspace)/d/**/manifest_*.json; do
@@ -81,6 +92,7 @@ steps:
          "<tr><td>" + .buildNumber + "</td>" +
          "<td><a href=\"https://dev.azure.com/ROCm-CI/ROCm-CI/_build/results?buildId=" + .buildId + "\">" + .buildId + "</a></td>" +
          "<td><a href=\"" + .repoUrl + "\">" + .repoName + "</a></td>" +
          "<td><a href=\"" + .repoUrl + "/tree/" + .repoRef + "/" + .repoSparse + "\">" + .repoSparse + "</a></td>" +
          "<td><a href=\"" + .repoUrl + "/tree/" + .repoRef + "\">" + .repoRef + "</a></td>" +
          "<td><a href=\"" + .repoUrl + "/commit/" + .repoVersion + "\">" + .repoVersion + "</a></td></tr>"
        ')
@@ -93,6 +105,7 @@ steps:
          "<tr><td>" + .buildNumber + "</td>" +
          "<td><a href=\"https://dev.azure.com/ROCm-CI/ROCm-CI/_build/results?buildId=" + .buildId + "\">" + .buildId + "</a></td>" +
          "<td><a href=\"" + .repoUrl + "\">" + .repoName + "</a></td>" +
          "<td><a href=\"" + .repoUrl + "/tree/" + .repoRef + "/" + .repoSparse + "\">" + .repoSparse + "</a></td>" +
          "<td><a href=\"" + .repoUrl + "/tree/" + .repoRef + "\">" + .repoRef + "</a></td>" +
          "<td><a href=\"" + .repoUrl + "/commit/" + .repoVersion + "\">" + .repoVersion + "</a></td></tr>"
        ')
@@ -107,7 +120,7 @@ steps:
  inputs:
    targetType: inline
    script: |
-      manifest_html=$(Build.ArtifactStagingDirectory)/manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
+      manifest_html=$(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
      cat <<EOF > $manifest_html
      <html>
      <h1>Manifest</h1>
@@ -117,6 +130,7 @@ steps:
        <th>Build Number</th>
        <th>Build ID</th>
        <th>Repo Name</th>
        <th>Repo Sparse</th>
        <th>Repo Ref</th>
        <th>Repo Version</th>
      </tr>
@@ -128,6 +142,7 @@ steps:
        <th>Build Number</th>
        <th>Build ID</th>
        <th>Repo Name</th>
        <th>Repo Sparse</th>
        <th>Repo Ref</th>
        <th>Repo Version</th>
      </tr>
@@ -148,7 +163,7 @@ steps:
  continueOnError: true
  inputs:
    tabName: Manifest
-    reportDir: $(Build.ArtifactStagingDirectory)/manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
+    reportDir: $(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
 - task: Bash@3
  displayName: Save manifest artifact file name
  condition: always()
@@ -157,5 +172,5 @@ steps:
    workingDirectory: $(Pipeline.Workspace)
    targetType: inline
    script: |
-      echo "manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html" >> pipelineArtifacts.txt
+      echo "manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html" >> pipelineArtifacts.txt
-      echo "manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json" >> pipelineArtifacts.txt
+      echo "manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json" >> pipelineArtifacts.txt
--- a/.azuredevops/templates/steps/miopen-get-ck-build.yml
+++ b/.azuredevops/templates/steps/miopen-get-ck-build.yml
@@ -25,7 +25,7 @@ steps:
      echo "Fetching CK build ID for commit $CK_COMMIT"
      CK_CHECKS_URL="$GH_API/composable_kernel/commits/${CK_COMMIT}/check-runs"
      CK_BUILD_ID=$(curl -s $CK_CHECKS_URL | \
-        jq '.check_runs[] | select(.name == "composable_kernel" and .app.slug == "azure-pipelines") | .details_url' | \
+        jq '.check_runs[] | select(.name == "composable_kernel" and .app.slug == "azure-pipelines" and .conclusion == "success") | .details_url' | \
        tr -d '"' | grep -oP 'buildId=\K\d+')
      # If none found, use latest successful CK build instead
--- a/.azuredevops/templates/steps/preamble.yml
+++ b/.azuredevops/templates/steps/preamble.yml
@@ -3,10 +3,27 @@
 # also display installed components and packages
 steps:
 - task: Bash@3
-  displayName: List apt packages
+  displayName: OS Version
  inputs:
    targetType: inline
-    script: apt list --installed
+    script: cat /etc/os-release
 - task: Bash@3
  displayName: List installed packages (apt, dnf, or yum)
  inputs:
    targetType: inline
    script: |
      if command -v apt >/dev/null 2>&1; then
        echo "Listing installed packages with apt:"
        apt list --installed
      elif command -v dnf >/dev/null 2>&1; then
        echo "Listing installed packages with dnf:"
        dnf list installed
      elif command -v yum >/dev/null 2>&1; then
        echo "Listing installed packages with yum:"
        yum list installed
      else
        echo "No supported package manager found (apt, dnf, yum)."
      fi
 - task: Bash@3
  displayName: Print Python version
  inputs:
@@ -16,7 +33,7 @@ steps:
  displayName: List Python packages
  inputs:
    targetType: inline
-    script: pip list -v
+    script: python3 -m pip list -v
 # The "Azure Pipelines" agents install CMake in multiple ways, including a standalone install into /usr/local/bin:
 # https://github.com/actions/runner-images/blob/6d939a3ab352a54a021dd67b071577287b6f14a5/images/ubuntu/scripts/build/install-cmake.sh#L27
 # This standalone CMake does not have a fixed version, and is not the same version as the one installed by the package manager
--- a/.azuredevops/templates/steps/test.yml
+++ b/.azuredevops/templates/steps/test.yml
@@ -2,21 +2,27 @@ parameters:
 - name: componentName
  type: string
  default: ''
 - name: os
  type: string
  default: ubuntu2204
 - name: testDir
  type: string
-  default: 'build'
+  default: build
 - name: testExecutable
  type: string
-  default: 'ctest'
+  default: ctest
 - name: testParameters
  type: string
-  default: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml'
+  default: --output-on-failure --force-new-ctest-process --output-junit test_output.xml
 - name: extraTestParameters
  type: string
  default: ''
 - name: testOutputFile
  type: string
  default: test_output.xml
 - name: testOutputFormat
  type: string
-  default: 'JUnit'
+  default: JUnit
  values:
    - JUnit
    - NUnit
@@ -26,26 +32,28 @@ parameters:
 - name: testPublishResults
  type: boolean
  default: true
- name: allowPartiallySucceededBuilds
+- name: allowComponentTestFailure
  type: object
  default:
    - amdsmi
    - aomp
    - HIPIFY
    - MIVisionX
    - rocm_smi_lib
    - rocprofiler-sdk
    - roctracer
    # the following do not use this template but allow test failures, included for completeness
    - aomp
    - ROCgdb
 steps:
 # run test, continue on failure to publish results
 # and to publish build artifacts
 - task: Bash@3
  displayName: '${{ parameters.componentName }} Test'
-  continueOnError: ${{ containsValue(parameters.allowPartiallySucceededBuilds, parameters.componentName) }}
+  continueOnError: ${{ containsValue(parameters.allowComponentTestFailure, parameters.componentName) }}
  inputs:
    targetType: inline
-    script: ${{ parameters.testExecutable }} ${{ parameters.testParameters }}
+    script: |
      ${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
      ${{ parameters.testExecutable }} ${{ parameters.testParameters }} ${{ parameters.extraTestParameters }}
    workingDirectory: ${{ parameters.testDir }}
 - ${{ if parameters.testPublishResults }}:
  - task: PublishTestResults@2
--- a/.azuredevops/variables-global.yml
+++ b/.azuredevops/variables-global.yml
@@ -3,6 +3,8 @@
 variables:
 - name: RESOURCES_REPOSITORIES
  value: $[ convertToJson(resources.repositories) ]
 - name: CCACHE_DIR
  value: $(Pipeline.Workspace)/ccache
 - name: CI_ROOT_PATH
  value: /.azuredevops
 - name: CI_COMPONENT_PATH
@@ -30,320 +32,136 @@ variables:
 - name: GFX90A_TEST_POOL
  value: gfx90a_test_pool
 - name: LATEST_RELEASE_VERSION
-  value: 6.4.0
+  value: 6.4.1
 - name: REPO_RADEON_VERSION
-  value: 6.4
+  value: 6.4.1
 - name: NEXT_RELEASE_VERSION
-  value: 6.5.0
+  value: 7.0.0
 - name: LATEST_RELEASE_TAG
-  value: rocm-6.4.0
+  value: rocm-6.4.1
 - name: DOCKER_SKIP_GFX
  value: gfx90a
 - name: AMDMIGRAPHX_GFX942_TEST_PIPELINE_ID
  value: 197
 - name: AMDMIGRAPHX_PIPELINE_ID
  value: 113
 - name: AMDMIGRAPHX_TAGGED_PIPELINE_ID
  value: 60
 - name: AMDSMI_PIPELINE_ID
  value: 99
 - name: AMDSMI_TAGGED_PIPELINE_ID
  value: 33
 - name: AOMP_EXTRAS_PIPELINE_ID
  value: 111
 - name: AOMP_EXTRAS_TAGGED_PIPELINE_ID
  value: 75
 - name: AOMP_PIPELINE_ID
  value: 115
 - name: AOMP_TAGGED_PIPELINE_ID
  value: 76
 - name: CCACHE_DIR
  value: $(Pipeline.Workspace)/ccache
 - name: CLR_PIPELINE_ID
  value: 145
 - name: CLR_TAGGED_PIPELINE_ID
  value: 71
 - name: COMPOSABLE_KERNEL_GFX942_TEST_PIPELINE_ID
  value: 179
 - name: COMPOSABLE_KERNEL_PIPELINE_ID
  value: 86
 - name: COMPOSABLE_KERNEL_TAGGED_PIPELINE_ID
  value: 38
 - name: FLANG_LEGACY_PIPELINE_ID
  value: 77
 - name: FLANG_LEGACY_TAGGED_PIPELINE_ID
  value: 77
 - name: HALF_PIPELINE_ID
  value: 101
 - name: HALF_TAGGED_PIPELINE_ID
  value: 11
 - name: HALF560_PIPELINE_ID
  value: 68
 - name: HALF560_BUILD_ID
  value: 621
 - name: HIP_PIPELINE_ID
  value: 93
 - name: HIP_TAGGED_PIPELINE_ID
  value: 31
 - name: HIP_TESTS_PIPELINE_ID
  value: 233
 - name: HIP_TESTS_TAGGED_PIPELINE_ID
  value: 220
 - name: HIPBLAS_COMMON_PIPELINE_ID
  value: 223
 - name: HIPBLAS_COMMON_TAGGED_PIPELINE_ID
  value: 224
 - name: HIPBLAS_GFX942_TEST_PIPELINE_ID
  value: 202
 - name: HIPBLAS_PIPELINE_ID
  value: 87
 - name: HIPBLAS_TAGGED_PIPELINE_ID
  value: 44
 - name: HIPBLASLT_GFX942_TEST_PIPELINE_ID
  value: 187
 - name: HIPBLASLT_PIPELINE_ID
  value: 112
 - name: HIPBLASLT_TAGGED_PIPELINE_ID
  value: 45
 - name: HIPCUB_GFX942_TEST_PIPELINE_ID
  value: 186
 - name: HIPCUB_PIPELINE_ID
-  value: 97
+  value: 277
 - name: HIPCUB_TAGGED_PIPELINE_ID
  value: 46
 - name: HIPFFT_GFX942_TEST_PIPELINE_ID
  value: 198
 - name: HIPFFT_PIPELINE_ID
  value: 121
 - name: HIPFFT_TAGGED_PIPELINE_ID
  value: 12
 - name: HIPFORT_PIPELINE_ID
  value: 102
 - name: HIPFORT_TAGGED_PIPELINE_ID
  value: 34
 - name: HIPIFY_PIPELINE_ID
  value: 92
 - name: HIPIFY_TAGGED_PIPELINE_ID
  value: 13
 - name: HIPRAND_GFX942_TEST_PIPELINE_ID
  value: 188
 - name: HIPRAND_PIPELINE_ID
-  value: 90
+  value: 275
 - name: HIPRAND_TAGGED_PIPELINE_ID
  value: 42
 - name: HIPSOLVER_GFX942_TEST_PIPELINE_ID
  value: 201
 - name: HIPSOLVER_PIPELINE_ID
  value: 84
 - name: HIPSOLVER_TAGGED_PIPELINE_ID
  value: 52
 - name: HIPSPARSE_GFX942_TEST_PIPELINE_ID
  value: 195
 - name: HIPSPARSE_PIPELINE_ID
  value: 83
 - name: HIPSPARSE_TAGGED_PIPELINE_ID
  value: 14
 - name: HIPSPARSELT_GFX942_TEST_PIPELINE_ID
  value: 200
 - name: HIPSPARSELT_PIPELINE_ID
  value: 104
 - name: HIPSPARSELT_TAGGED_PIPELINE_ID
  value: 53
 - name: HIPTENSOR_GFX942_TEST_PIPELINE_ID
  value: 192
 - name: HIPTENSOR_PIPELINE_ID
  value: 105
 - name: HIPTENSOR_TAGGED_PIPELINE_ID
  value: 56
 - name: LLVM_PROJECT_PIPELINE_ID
  value: 2
 - name: LLVM_PROJECT_TAGGED_PIPELINE_ID
  value: 8
 - name: MIOPEN_PIPELINE_ID
  value: 108
 - name: MIOPEN_TAGGED_PIPELINE_ID
  value: 58
 - name: MIVISIONX_PIPELINE_ID
  value: 80
 - name: MIVISIONX_TAGGED_PIPELINE_ID
  value: 18
 - name: OMNIPERF_PIPELINE_ID
  value: 241
 - name: OMNIPERF_TAGGED_PIPELINE_ID
  value: 242
 - name: OMNITRACE_PIPELINE_ID
  value: 253
 - name: OMNITRACE_TAGGED_PIPELINE_ID
  value: 252
 - name: RCCL_GFX942_TEST_PIPELINE_ID
  value: 184
 - name: RCCL_PIPELINE_ID
  value: 107
 - name: RCCL_TAGGED_PIPELINE_ID
  value: 15
 - name: RDC_PIPELINE_ID
  value: 100
 - name: RDC_TAGGED_PIPELINE_ID
  value: 59
 - name: ROCAL_PIPELINE_ID
  value: 151
 - name: ROCALUTION_GFX942_TEST_PIPELINE_ID
  value: 196
 - name: ROCALUTION_PIPELINE_ID
  value: 89
 - name: ROCALUTION_TAGGED_PIPELINE_ID
  value: 16
 - name: ROCBLAS_GFX942_TEST_PIPELINE_ID
  value: 185
 - name: ROCBLAS_PIPELINE_ID
  value: 85
 - name: ROCBLAS_TAGGED_PIPELINE_ID
  value: 32
 - name: ROCDBGAPI_PIPELINE_ID
  value: 135
 - name: ROCDBGAPI_TAGGED_PIPELINE_ID
  value: 17
 - name: ROCDECODE_PIPELINE_ID
  value: 79
 - name: ROCDECODE_TAGGED_PIPELINE_ID
  value: 21
 - name: ROCFFT_GFX942_TEST_PIPELINE_ID
  value: 189
 - name: ROCFFT_PIPELINE_ID
  value: 120
 - name: ROCFFT_TAGGED_PIPELINE_ID
  value: 19
 - name: ROCGDB_PIPELINE_ID
  value: 134
 - name: ROCGDB_TAGGED_PIPELINE_ID
  value: 50
 - name: ROCJPEG_PIPELINE_ID
  value: 262
 - name: ROCJPEG_TAGGED_PIPELINE_ID
  value: 263
 - name: ROCM_BANDWIDTH_TEST_PIPELINE_ID
  value: 88
 - name: ROCM_BANDWIDTH_TEST_TAGGED_PIPELINE_ID
  value: 23
 - name: ROCM_CMAKE_PIPELINE_ID
  value: 6
 - name: ROCM_CMAKE_TAGGED_PIPELINE_ID
  value: 7
 - name: ROCM_CORE_PIPELINE_ID
  value: 103
 - name: ROCM_CORE_TAGGED_PIPELINE_ID
  value: 22
 - name: ROCM_EXAMPLES_GFX942_TEST_PIPELINE_ID
  value: 204
 - name: ROCM_EXAMPLES_PIPELINE_ID
  value: 216
 - name: ROCM_EXAMPLES_TAGGED_PIPELINE_ID
  value: 245
 - name: ROCM_SMI_LIB_PIPELINE_ID
  value: 96
 - name: ROCM_SMI_LIB_TAGGED_PIPELINE_ID
  value: 47
 - name: ROCMINFO_PIPELINE_ID
  value: 91
 - name: ROCMINFO_TAGGED_PIPELINE_ID
  value: 27
 - name: ROCMLIR_PIPELINE_ID
  value: 229
 - name: ROCMLIR_TAGGED_PIPELINE_ID
  value: 62
 - name: ROCMVALIDATIONSUITE_PIPELINE_ID
  value: 106
 - name: ROCMVALIDATIONSUITE_TAGGED_PIPELINE_ID
  value: 43
 - name: ROCPRIM_GFX942_TEST_PIPELINE_ID
  value: 180
 - name: ROCPRIM_PIPELINE_ID
-  value: 82
+  value: 273
 - name: ROCPRIM_TAGGED_PIPELINE_ID
  value: 20
 - name: ROCPROFILER_GFX942_TEST_PIPELINE_ID
  value: 190
 - name: ROCPROFILER_COMPUTE_PIPELINE_ID
  value: 257
 - name: ROCPROFILER_COMPUTE_TAGGED_PIPELINE_ID
  value: 258
 - name: ROCPROFILER_REGISTER_PIPELINE_ID
  value: 1
 - name: ROCPROFILER_REGISTER_TAGGED_PIPELINE_ID
  value: 25
 - name: ROCPROFILER_SDK_PIPELINE_ID
  value: 246
 - name: ROCPROFILER_SDK_TAGGED_PIPELINE_ID
  value: 234
 - name: ROCPROFILER_SYSTEMS_PIPELINE_ID
  value: 255
 - name: ROCPROFILER_SYSTEMS_TAGGED_PIPELINE_ID
  value: 254
 - name: ROCPROFILER_PIPELINE_ID
  value: 143
 - name: ROCPROFILER_TAGGED_PIPELINE_ID
  value: 28
 - name: ROCPYDECODE_PIPELINE_ID
  value: 239
 - name: ROCPYDECODE_TAGGED_PIPELINE_ID
  value: 232
 - name: ROCR_DEBUG_AGENT_PIPELINE_ID
  value: 136
 - name: ROCR_DEBUG_AGENT_TAGGED_PIPELINE_ID
  value: 29
 - name: ROCR_RUNTIME_PIPELINE_ID
  value: 10
 - name: ROCR_RUNTIME_TAGGED_PIPELINE_ID
  value: 24
 - name: ROCRAND_GFX942_TEST_PIPELINE_ID
  value: 183
 - name: ROCRAND_PIPELINE_ID
-  value: 95
+  value: 274
 - name: ROCRAND_TAGGED_PIPELINE_ID
  value: 41
 - name: ROCSOLVER_GFX942_TEST_PIPELINE_ID
  value: 199
 - name: ROCSOLVER_PIPELINE_ID
  value: 81
 - name: ROCSOLVER_TAGGED_PIPELINE_ID
  value: 55
 - name: ROCSPARSE_GFX942_TEST_PIPELINE_ID
  value: 191
 - name: ROCSPARSE_PIPELINE_ID
  value: 98
 - name: ROCSPARSE_TAGGED_PIPELINE_ID
  value: 67
 - name: ROCT_THUNK_INTERFACE_PIPELINE_ID
  value: 3
 - name: ROCT_THUNK_INTERFACE_TAGGED_PIPELINE_ID
  value: 9
 - name: ROCTHRUST_GFX942_TEST_PIPELINE_ID
  value: 194
 - name: ROCTHRUST_PIPELINE_ID
-  value: 94
+  value: 276
 - name: ROCTHRUST_TAGGED_PIPELINE_ID
  value: 26
 - name: ROCTRACER_GFX942_TEST_PIPELINE_ID
  value: 181
 - name: ROCTRACER_PIPELINE_ID
  value: 141
 - name: ROCTRACER_TAGGED_PIPELINE_ID
  value: 30
 - name: ROCWMMA_GFX942_TEST_PIPELINE_ID
  value: 193
 - name: ROCWMMA_PIPELINE_ID
  value: 109
 - name: ROCWMMA_TAGGED_PIPELINE_ID
  value: 57
 - name: RPP_GFX942_TEST_PIPELINE_ID
  value: 182
 - name: RPP_PIPELINE_ID
  value: 78
 - name: RPP_TAGGED_PIPELINE_ID
  value: 39
 - name: TRANSFERBENCH_PIPELINE_ID
  value: 265
 - name: TRANSFERBENCH_TAGGED_PIPELINE_ID
  value: 266
 - name: BOOST_DEPENDENCY_PIPELINE_ID
  value: 250
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -272,6 +272,7 @@ NBIO
 NBIOs
 NCCL
 NCF
 NFS
 NIC
 NICs
 NLI
@@ -500,6 +501,7 @@ ZenDNN
 accuracies
 activations
 addr
 ade
 ai
 alloc
 allocatable
@@ -515,6 +517,7 @@ avx
 awk
 backend
 backends
 bb
 benchmarked
 benchmarking
 bfloat
@@ -538,6 +541,7 @@ cd
 centos
 centric
 changelog
 checkpointing
 chiplet
 cmake
 cmd
@@ -578,6 +582,7 @@ de
 deallocation
 debuggability
 debian
 deepseek
 denoise
 denoised
 denoises
@@ -601,6 +606,7 @@ embeddings
 enablement
 encodings
 endfor
 endif
 endpgm
 enqueue
 env
@@ -702,6 +708,7 @@ migratable
 miopen
 miopengemm
 mivisionx
 mixtral
 mjx
 mkdir
 mlirmiopen
@@ -843,6 +850,7 @@ subfolder
 subfolders
 submodule
 submodules
 subnet
 supercomputing
 symlink
 symlinks
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,139 @@ This page is a historical overview of changes made to ROCm components. This
 consolidated changelog documents key modifications and improvements across
 different versions of the ROCm software stack and its components.
 ## ROCm 6.4.1
 See the [ROCm 6.4.1 release notes](https://rocm.docs.amd.com/en/docs-6.4.1/about/release-notes.html)
 for a complete overview of this release.
 ### **AMD SMI** (25.4.2)
 #### Added
 * Dumping CPER entries from RAS tool `amdsmi_get_gpu_cper_entries()` to Python and C APIs.
  - Dumping CPER entries consist of `amdsmi_cper_hdr_t`.
  - Dumping CPER entries is also enabled in the CLI interface through `sudo amd-smi ras --cper`.
 * `amdsmi_get_gpu_busy_percent` to the C API.
 #### Changed
 * Modified VRAM display for `amd-smi monitor -v`. 
 #### Optimized
 * Improved load times for CLI commands when the GPU has multiple partitions.
 #### Resolved issues
 * Fixed partition enumeration in `amd-smi list -e`, `amdsmi_get_gpu_enumeration_info()`, `amdsmi_enumeration_info_t`, `drm_card`, and `drm_render` fields.
 #### Known issues
 * When using the `--follow` flag with `amd-smi ras --cper`, CPER entries are not streamed continuously as intended. This will be fixed in an upcoming ROCm release.
 ```{note}
 See the full [AMD SMI changelog](https://github.com/ROCm/amdsmi/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
 ```
 ### **HIP** (6.4.1)
 #### Added
 * New log mask enumeration `LOG_COMGR` enables logging precise code object information.
 #### Changed
 * HIP runtime uses device bitcode before SPIRV.
 * The implementation of preventing `hipLaunchKernel` latency degradation with number of idle streams is reverted or disabled by default.
 #### Optimized
 * Improved kernel logging includes de-mangling shader names.
 * Refined implementation in HIP APIs `hipEventRecord` and `hipStreamWaitEvent` for performance improvement.
 #### Resolved issues
 * Stale state during the graph capture. The return error was fixed; the HIP runtime now always uses the latest dependent nodes during `hipEventRecord` capture.
 * Segmentation fault during kernel execution. HIP runtime now allows maximum stack size as per ISA on the GPU device.
 ### **hipBLASLt** (0.12.1)
 #### Resolved issues
 * Fixed an accuracy issue for some solutions using an `FP32` or `TF32` data type with a TT transpose.
 ### **RCCL** (2.22.3)
 #### Changed
 * MSCCL++ is now disabled by default. To enable it, set `RCCL_MSCCLPP_ENABLE=1`.
 #### Resolved issues
 * Fixed an issue where early termination, in rare circumstances, could cause the application to stop responding by adding synchronization before destroying a proxy thread.
 * Fixed the accuracy issue for the MSCCLPP `allreduce7` kernel in graph mode.
 #### Known issues
 * When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault. The recommended workaround is to disable MSCCL with `export RCCL_MSCCL_ENABLE=0`.
  This issue will be fixed in a future ROCm release.
 * Within the RCCL-UnitTests test suite, failures occur in tests ending with the
  `.ManagedMem` and `.ManagedMemGraph` suffixes. These failures only affect the
  test results and do not affect the RCCL component itself. This issue will be
  resolved in a future ROCm release.
 ### **rocALUTION** (3.2.3)
 #### Added
 * The `-a` option has been added to the `rmake.py` build script. This option allows you to select specific architectures when building on Microsoft Windows.
 #### Resolved issues
 * Fixed an issue where the `HIP_PATH` environment variable was being ignored when compiling on Microsoft Windows.
 ### **ROCm Data Center Tool** (0.3.0)
 #### Added
 - Support for GPU partitions.
 - `RDC_FI_GPU_BUSY_PERCENT` metric.
 #### Changed
 - Updated `rdc_field` to align with `rdc_bootstrap` for current metrics.
 #### Resolved issues
 - Fixed [ROCProfiler](https://rocm.docs.amd.com/projects/rocprofiler/en/docs-6.4.0/index.html) eval metrics and memory leaks.
 ### **ROCm SMI** (7.5.0)
 #### Resolved issues
 - Fixed partition enumeration. It now refers to the correct DRM Render and Card paths.
 ```{note}
 See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/release/rocm-rel-6.4/CHANGELOG.md) for details, examples, and in-depth descriptions.
 ```
 ### **ROCm Systems Profiler** (1.0.1)
 #### Added 
 * How-to document for [network performance profiling](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/how-to/nic-profiling.html) for standard Network Interface Cards (NICs).
 #### Resolved issues
 * Fixed a build issue with Dyninst on GCC 13.
 ### **ROCr Runtime** (1.15.0)
 #### Resolved issues
 * Fixed a rare occurrence issue on AMD Instinct MI25, MI50, and MI100 GPUs, where the `SDMA` copies might start before the dependent Kernel finishes and could cause memory corruption.
 ## ROCm 6.4.0
 See the [ROCm 6.4.0 release notes](https://rocm.docs.amd.com/en/docs-6.4.0/about/release-notes.html)
@@ -761,6 +894,18 @@ See the full [ROCm SMI changelog](https://github.com/ROCm/rocm_smi_lib/blob/rele
 - Fixed an issue where sampling multi-GPU Python workloads caused the system to stop responding.
 ### **ROCm Validation Suite** (1.1.0)
 #### Added
 * Configuration files for MI210.
 * Support for OCP fp8 data type.
 * GPU index-based CLI execution.
 #### Changed
 * JSON logging with updated schema.
 ### **rocPRIM** (3.4.0)
 #### Added
--- a/README.md
+++ b/README.md
@@ -19,142 +19,17 @@ ROCm supports programming models, such as OpenMP and OpenCL, and includes all ne
 source software compilers, debuggers, and libraries. ROCm is fully integrated into machine learning
 (ML) frameworks, such as PyTorch and TensorFlow.
-## Getting the ROCm Source Code
+> [!IMPORTANT]
 > A new open source build platform for ROCm is under development at
 > https://github.com/ROCm/TheRock, featuring a unified CMake build with bundled
 > dependencies, Windows support, and more.
 >
 > The instructions below describe the prior process for building from source
 > which will be replaced once TheRock is mature enough.
-AMD ROCm is built from open source software. It is, therefore, possible to modify the various components of ROCm by downloading the source code and rebuilding the components. The source code for ROCm components can be cloned from each of the GitHub repositories using git.  For easy access to download the correct versions of each of these tools, the ROCm repository contains a repo manifest file called [default.xml](./default.xml). You can use this manifest file to download the source code for ROCm software.
+## Getting and Building ROCm from Source
-### Installing the repo tool
+Please use the [TheRock](https://github.com/ROCm/TheRock) build system to build ROCm from source.
 The repo tool from Google allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo tool:
 ```bash
 mkdir -p ~/bin/
 curl https://storage.googleapis.com/git-repo-downloads/repo > ~/bin/repo
 chmod a+x ~/bin/repo
 ```
 **Note:** The ```~/bin/``` folder is used as an example. You can specify a different folder to install the repo tool into if you desire.
 ### Installing git-lfs
 Some ROCm projects use the Git Large File Storage (LFS) format that may require you to install git-lfs. Refer to [Git Large File Storage](https://github.com/git-lfs/git-lfs/blob/main/INSTALLING.md) for more information. For example, to install git-lfs for Ubuntu, use the following command:
 ```bash
 sudo apt-get install git-lfs
 ```
 ### Downloading the ROCm source code
 The following example shows how to use the repo tool to download the ROCm source code. If you choose a directory other than ~/bin/ to install the repo tool, you must use that chosen directory in the code as shown below:
 ```bash
 mkdir -p ~/ROCm/
 cd ~/ROCm/
 export ROCM_VERSION=6.4.0
 ~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.4.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
 ~/bin/repo sync
 ```
 **Note:** Using this sample code will cause the repo tool to download the open source code associated with the specified ROCm release. Ensure that you have ssh-keys configured on your machine for your GitHub ID prior to the download as explained at [Connecting to GitHub with SSH](https://docs.github.com/en/authentication/connecting-to-github-with-ssh).
 ## Building the ROCm source code
 Each ROCm component repository contains directions for building that component, such as the rocSPARSE documentation [Installation and Building for Linux](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/install/Linux_Install_Guide.html). Refer to the specific component documentation for instructions on building the repository.
 Each release of the ROCm software supports specific hardware and software configurations. Refer to [System requirements (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) for the current supported hardware and OS.
 ## Build ROCm from source
 The build uses as many processors as it can find to build in parallel. Some of the compiles can consume as much as 10 GB of RAM, so make sure you have plenty of swap space!
 By default the ROCm build will compile for all supported GPU architectures and will take approximately 500 CPU hours.
 The build time is reduced significantly if you limit the GPU architectures to build for by setting the `GPU_ARCHS` environment variable, as shown below.
 ```bash
 # --------------------------------------
 # Step1: clone source code
 # --------------------------------------
 mkdir -p ~/WORKSPACE/      # Or any folder name other than WORKSPACE
 cd ~/WORKSPACE/
 export ROCM_VERSION=6.4.0
 ~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.4.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
 ~/bin/repo sync
 # --------------------------------------
 # Step 2: Prepare build environment
 # --------------------------------------
 # Option 1: Start a docker container
 # Pulling required base docker images:
 # Ubuntu22.04 built from ROCm/tools/rocm-build/docker/ubuntu22/Dockerfile
 docker pull rocm/rocm-build-ubuntu-22.04:6.4
 # Ubuntu24.04 built from ROCm/tools/rocm-build/docker/ubuntu24/Dockerfile
 docker pull rocm/rocm-build-ubuntu-24.04:6.4
 # Start docker container and mount the source code folder:
 docker run -ti \
    -e ROCM_VERSION=${ROCM_VERSION} \
    -e CCACHE_DIR=$HOME/.ccache \
    -e CCACHE_ENABLED=true \
    -e DOCK_WORK_FOLD=/src \
    -w /src \
    -v $PWD:/src \
    -v /etc/passwd:/etc/passwd \
    -v /etc/shadow:/etc/shadow \
    -v ${HOME}/.ccache:${HOME}/.ccache \
    -u $(id -u):$(id -g) \
    <replace_with_required_ubuntu_base_docker_image> bash
 # Option 2: Install required packages into the host machine
 # For ubuntu22.04 system
 cd ROCm/tools/rocm-build/docker/ubuntu22
 cp * /tmp && cd /tmp
 bash install-prerequisites.sh
 # For ubuntu24.04 system
 cd ROCm/tools/rocm-build/docker/ubuntu24
 cp * /tmp && cd /tmp
 bash install-prerequisites.sh
 # --------------------------------------
 # Step 3: Run build command line
 # --------------------------------------
 # Select GPU targets before building:
 # When GPU_ARCHS is not set, default GPU targets supported by ROCm6.1 will be used.
 # To build against a subset of GFX architectures you can use the below env variable.
 # Support MI300 (gfx940, gfx941, gfx942).
 export GPU_ARCHS="gfx942"               # Example
 export GPU_ARCHS="gfx940;gfx941;gfx942" # Example
 # Pick and run build commands in the docker container:
 # Build rocm-dev packages
 make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} rocm-dev
 # Build all ROCm packages
 make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} all
 # list all ROCm components to find required components
 make -f ROCm/tools/rocm-build/ROCm.mk list_components
 # Build a single ROCm package
 make -f ROCm/tools/rocm-build/ROCm.mk T_rocblas
 # Find built packages in ubuntu22.04:
 out/ubuntu-22.04/22.04/deb/
 # Find built packages in ubuntu24.04:
 out/ubuntu-24.04/24.04/deb/
 # Find built logs in ubuntu22.04:
 out/ubuntu-22.04/22.04/logs/
 # Find built logs in ubuntu24.04:
 out/ubuntu-24.04/24.04/logs/
 # All logs pertaining to failed components end with the .errors extension.
 out/ubuntu-22.04/22.04/logs/rocblas.errors      # Example
 # All logs pertaining to building components, end with .inprogress extension.
 out/ubuntu-22.04/22.04/logs/rocblas.inprogress  # Example
 # All logs pertaining to passed components, use the component names.
 out/ubuntu-22.04/22.04/logs/rocblas             # Example
 ```
 Note: [Overview for ROCm.mk](tools/rocm-build/README.md)
 ## ROCm documentation
--- a/RELEASE.md
+++ b/RELEASE.md
--- a/default.xml
+++ b/default.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-    <default revision="refs/tags/rocm-6.4.0"
+    <default revision="refs/tags/rocm-6.4.1"
     remote="rocm-org"
     sync-c="true"
     sync-j="4" />
--- a/docs/compatibility/compatibility-matrix-historical-6.0.csv
+++ b/docs/compatibility/compatibility-matrix-historical-6.0.csv
@@ -1,121 +1,126 @@
-ROCm Version,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
+ROCm Version,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0, 6.1.5, 6.1.2, 6.1.1, 6.1.0, 6.0.2, 6.0.0
-      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
+      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2,"Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04","Ubuntu 24.04.1, 24.04",Ubuntu 24.04,,,,,,
-      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
+      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4","Ubuntu 22.04.5, 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3","Ubuntu 22.04.4, 22.04.3, 22.04.2","Ubuntu 22.04.4, 22.04.3, 22.04.2"
-      ,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
+      ,,,,,,,,,,,"Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5","Ubuntu 20.04.6, 20.04.5"
-      ,"RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
+      ,"RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.4, 9.3, 9.2","RHEL 9.3, 9.2","RHEL 9.3, 9.2"
-      ,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
+      ,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.10, 8.9","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8","RHEL 8.9, 8.8"
-      ,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
+      ,SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP6, SP5","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4","SLES 15 SP5, SP4"
-      ,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
+      ,,,,,,,,,,,,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9,CentOS 7.9
-      ,"Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
+      ,"Oracle Linux 9, 8 [#mi300x-past-60]_","Oracle Linux 9, 8 [#mi300x-past-60]_",Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.10 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,Oracle Linux 8.9 [#mi300x-past-60]_,,,
-,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
+      ,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,Debian 12 [#single-node-past-60]_,,,,,,,,,,,
-,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,,,,,,,,,,,,
+      ,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,Azure Linux 3.0 [#mi300x-past-60]_,,,,,,,,,,,,
-      ,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
+      ,.. _architecture-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
-      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
+      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3,CDNA3
-      ,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
+      ,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2,CDNA2
-      ,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
+      ,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA,CDNA
-      ,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
+      ,RDNA4,,,,,,,,,,,,,,,
-      ,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
+      ,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3,RDNA3
-      ,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
+      ,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2,RDNA2
-      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
+      ,.. _gpu-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
-      ,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
+      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-      ,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
+      ,gfx1200 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-      ,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
+,gfx1101 [#RDNA-OS-past-60]_,,,,,,,,,,,,,,,
-      ,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
+      ,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100,gfx1100
-,,,,,,,,,,,,,,,
+      ,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030,gfx1030
-      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
+      ,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942,gfx942 [#mi300_624-past-60]_,gfx942 [#mi300_622-past-60]_,gfx942 [#mi300_621-past-60]_,gfx942 [#mi300_620-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_612-past-60]_, gfx942 [#mi300_611-past-60]_, gfx942 [#mi300_610-past-60]_, gfx942 [#mi300_602-past-60]_, gfx942 [#mi300_600-past-60]_
-      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
+      ,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a,gfx90a
-      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
+      ,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908,gfx908
-      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
+,,,,,,,,,,,,,,,,
-      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
+      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
-      ,,,,,,,,,,,,,,,
+      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 1.13","2.4, 2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.3, 2.2, 2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13","2.1, 2.0, 1.13"
-      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
+      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
-      `UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
+      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
-      `UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
+      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
-      ,,,,,,,,,,,,,,,
+,,,,,,,,,,,,,,,,
-      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
+      ,,,,,,,,,,,,,,,,
-      Thrust,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
+      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
-      CUB,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
+      `UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.3.0,>=1.2.0,>=1.2.0
-,,,,,,,,,,,,,,,
+      `UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.15.0,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1,>=1.14.1
-      KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
+      ,,,,,,,,,,,,,,,,
-      KMD versions,"6.4.x, 6.3.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
+      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
-      ,,,,,,,,,,,,,,,
+      Thrust,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
-      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
+      CUB,2.5.0,2.5.0,2.3.2,2.3.2,2.3.2,2.3.2,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.1,2.0.1
-      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
+,,,,,,,,,,,,,,,,
-      :doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
+      KMD & USER SPACE [#kfd_support-past-60]_,.. _kfd-userspace-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
-      :doc:`MIOpen <miopen:index>`,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x, 5.7.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x","6.2.x, 6.1.x, 6.0.x, 5.7.x, 5.6.x"
-      :doc:`MIVisionX <mivisionx:index>`,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
+      ,,,,,,,,,,,,,,,,
-      :doc:`rocAL <rocal:index>`,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
-      :doc:`rocDecode <rocdecode:index>`,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
+      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0
-      :doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.12.0,2.11.0,2.11.0,2.11.0,2.11.0,2.10.0,2.10.0,2.10.0,2.10.0,2.9.0,2.9.0,2.9.0,2.9.0,2.8.0,2.8.0
-      :doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`MIOpen <miopen:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      :doc:`RPP <rpp:index>`,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
+      :doc:`MIVisionX <mivisionx:index>`,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0,2.5.0
-      ,,,,,,,,,,,,,,,
+      :doc:`rocAL <rocal:index>`,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0,2.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
+      :doc:`rocDecode <rocdecode:index>`,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.6.0,0.5.0,0.5.0,N/A,N/A
-      :doc:`RCCL <rccl:index>`,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
+      :doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.8.0,0.6.0,0.6.0,0.6.0,0.6.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-`rocSHMEM <https://github.com/ROCm/rocSHMEM>`_,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.3.1,0.2.0,0.2.0,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0,N/A,N/A,N/A,N/A,N/A,N/A
-      ,,,,,,,,,,,,,,,
+      :doc:`RPP <rpp:index>`,1.9.10,1.9.10,1.9.1,1.9.1,1.9.1,1.9.1,1.8.0,1.8.0,1.8.0,1.8.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0
-      MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
+      ,,,,,,,,,,,,,,,,
-      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
+      COMMUNICATION,.. _commlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
-      :doc:`hipBLAS <hipblas:index>`,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
+      :doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.21.5,2.21.5,2.21.5,2.21.5,2.20.5,2.20.5,2.20.5,2.20.5,2.18.6,2.18.6,2.18.6,2.18.6,2.18.3,2.18.3
-      :doc:`hipBLASLt <hipblaslt:index>`,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
+      :doc:`rocSHMEM <rocshmem:index>`,2.0.0,2.0.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`hipFFT <hipfft:index>`,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
+      ,,,,,,,,,,,,,,,,
-      :doc:`hipfort <hipfort:index>`,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
+      MATH LIBS,.. _mathlibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
-      :doc:`hipRAND <hiprand:index>`,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
+      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0,1.12.0
-      :doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
+      :doc:`hipBLAS <hipblas:index>`,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.0,2.1.0,2.1.0,2.1.0,2.0.0,2.0.0
-      :doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
+      :doc:`hipBLASLt <hipblaslt:index>`,0.12.1,0.12.0,0.10.0,0.10.0,0.10.0,0.10.0,0.8.0,0.8.0,0.8.0,0.8.0,0.7.0,0.7.0,0.7.0,0.7.0,0.6.0,0.6.0
-      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
+      :doc:`hipFFT <hipfft:index>`,1.0.18,1.0.18,1.0.17,1.0.17,1.0.17,1.0.17,1.0.16,1.0.15,1.0.15,1.0.14,1.0.14,1.0.14,1.0.14,1.0.14,1.0.13,1.0.13
-      :doc:`rocALUTION <rocalution:index>`,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
+      :doc:`hipfort <hipfort:index>`,0.6.0,0.6.0,0.5.1,0.5.1,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0
-      :doc:`rocBLAS <rocblas:index>`,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
+      :doc:`hipRAND <hiprand:index>`,2.12.0,2.12.0,2.11.1,2.11.1,2.11.1,2.11.0,2.11.1,2.11.0,2.11.0,2.11.0,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16,2.10.16
-      :doc:`rocFFT <rocfft:index>`,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
+      :doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.4.0,2.3.0,2.3.0,2.3.0,2.3.0,2.2.0,2.2.0,2.2.0,2.2.0,2.1.1,2.1.1,2.1.1,2.1.0,2.0.0,2.0.0
-      :doc:`rocRAND <rocrand:index>`,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
+      :doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.1.1,3.1.1,3.1.1,3.1.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
-      :doc:`rocSOLVER <rocsolver:index>`,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
+      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.3,0.2.2,0.2.2,0.2.2,0.2.2,0.2.1,0.2.1,0.2.1,0.2.1,0.2.0,0.2.0,0.1.0,0.1.0,0.1.0,0.1.0
-      :doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
+      :doc:`rocALUTION <rocalution:index>`,3.2.3,3.2.2,3.2.1,3.2.1,3.2.1,3.2.1,3.2.1,3.2.0,3.2.0,3.2.0,3.1.1,3.1.1,3.1.1,3.1.1,3.0.3,3.0.3
-      :doc:`rocWMMA <rocwmma:index>`,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
+      :doc:`rocBLAS <rocblas:index>`,4.4.0,4.4.0,4.3.0,4.3.0,4.3.0,4.3.0,4.2.4,4.2.1,4.2.1,4.2.0,4.1.2,4.1.2,4.1.0,4.1.0,4.0.0,4.0.0
-      :doc:`Tensile <tensile:src/index>`,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
+      :doc:`rocFFT <rocfft:index>`,1.0.32,1.0.32,1.0.31,1.0.31,1.0.31,1.0.31,1.0.30,1.0.29,1.0.29,1.0.28,1.0.27,1.0.27,1.0.27,1.0.26,1.0.25,1.0.23
-      ,,,,,,,,,,,,,,,
+      :doc:`rocRAND <rocrand:index>`,3.3.0,3.3.0,3.2.0,3.2.0,3.2.0,3.2.0,3.1.1,3.1.0,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,2.10.17
-      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
+      :doc:`rocSOLVER <rocsolver:index>`,3.28.0,3.28.0,3.27.0,3.27.0,3.27.0,3.27.0,3.26.2,3.26.0,3.26.0,3.26.0,3.25.0,3.25.0,3.25.0,3.25.0,3.24.0,3.24.0
-      :doc:`hipCUB <hipcub:index>`,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.2,3.1.2,3.1.2,3.1.2,3.0.2,3.0.2
-      :doc:`hipTensor <hiptensor:index>`,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
+      :doc:`rocWMMA <rocwmma:index>`,1.7.0,1.7.0,1.6.0,1.6.0,1.6.0,1.6.0,1.5.0,1.5.0,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0
-      :doc:`rocPRIM <rocprim:index>`,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
+      :doc:`Tensile <tensile:src/index>`,4.43.0,4.43.0,4.42.0,4.42.0,4.42.0,4.42.0,4.41.0,4.41.0,4.41.0,4.41.0,4.40.0,4.40.0,4.40.0,4.40.0,4.39.0,4.39.0
-      :doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
+      ,,,,,,,,,,,,,,,,
-      ,,,,,,,,,,,,,,,
+      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
-      SUPPORT LIBS,,,,,,,,,,,,,,,
+      :doc:`hipCUB <hipcub:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.1,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      `hipother <https://github.com/ROCm/hipother>`_,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      :doc:`hipTensor <hiptensor:index>`,1.5.0,1.5.0,1.4.0,1.4.0,1.4.0,1.4.0,1.3.0,1.3.0,1.3.0,1.3.0,1.2.0,1.2.0,1.2.0,1.2.0,1.1.0,1.1.0
-      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
+      :doc:`rocPRIM <rocprim:index>`,3.4.0,3.4.0,3.3.0,3.3.0,3.3.0,3.3.0,3.2.2,3.2.0,3.2.0,3.2.0,3.1.0,3.1.0,3.1.0,3.1.0,3.0.0,3.0.0
-      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
+      :doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.3.0,3.1.1,3.1.0,3.1.0,3.0.1,3.0.1,3.0.1,3.0.1,3.0.1,3.0.0,3.0.0
-      ,,,,,,,,,,,,,,,
+      ,,,,,,,,,,,,,,,,
-      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
+      SUPPORT LIBS,,,,,,,,,,,,,,,,
-      :doc:`AMD SMI <amdsmi:index>`,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
+      `hipother <https://github.com/ROCm/hipother>`_,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
-      :doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6.2.0,6.1.5,6.1.2,6.1.1,6.1.0,6.0.2,6.0.0
-      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,N/A [#ROCT-rocr-past-60]_,20240607.5.7,20240607.5.7,20240607.4.05,20240607.1.4246,20240125.5.08,20240125.5.08,20240125.5.08,20240125.3.30,20231016.2.245,20231016.2.245
-      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
+      ,,,,,,,,,,,,,,,,
-      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
+      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
-      ,,,,,,,,,,,,,,,
+      :doc:`AMD SMI <amdsmi:index>`,25.4.2,25.3.0,24.7.1,24.7.1,24.7.1,24.7.1,24.6.3,24.6.3,24.6.3,24.6.2,24.5.1,24.5.1,24.5.1,24.4.1,23.4.2,23.4.2
-      PERFORMANCE TOOLS,,,,,,,,,,,,,,,
+      :doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0,0.3.0
-      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
+      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.5.0,7.5.0,7.4.0,7.4.0,7.4.0,7.4.0,7.3.0,7.3.0,7.3.0,7.3.0,7.2.0,7.2.0,7.0.0,7.0.0,6.0.2,6.0.0
-      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.1.0,1.0.60204,1.0.60202,1.0.60201,1.0.60200,1.0.60105,1.0.60102,1.0.60101,1.0.60100,1.0.60002,1.0.60000
-      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
+      ,,,,,,,,,,,,,,,,
-      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
+      PERFORMANCE TOOLS,,,,,,,,,,,,,,,,
-      :doc:`ROCTracer <roctracer:index>`,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
+      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0,1.4.0
-      ,,,,,,,,,,,,,,,
+      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.0,3.1.0,3.0.0,3.0.0,3.0.0,3.0.0,2.0.1,2.0.1,2.0.1,2.0.1,N/A,N/A,N/A,N/A,N/A,N/A
-      DEVELOPMENT TOOLS,,,,,,,,,,,,,,,
+      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.1,1.0.0,0.1.2,0.1.1,0.1.0,0.1.0,1.11.2,1.11.2,1.11.2,1.11.2,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`HIPIFY <hipify:index>`,19.0.0.25104,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60401,2.0.60400,2.0.60303,2.0.60302,2.0.60301,2.0.60300,2.0.60204,2.0.60202,2.0.60201,2.0.60200,2.0.60105,2.0.60102,2.0.60101,2.0.60100,2.0.60002,2.0.60000
-      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
+      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.6.0,0.5.0,0.5.0,0.5.0,0.5.0,0.4.0,0.4.0,0.4.0,0.4.0,N/A,N/A,N/A,N/A,N/A,N/A
-      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
+      :doc:`ROCTracer <roctracer:index>`,4.1.60401,4.1.60400,4.1.60303,4.1.60302,4.1.60301,4.1.60300,4.1.60204,4.1.60202,4.1.60201,4.1.60200,4.1.60105,4.1.60102,4.1.60101,4.1.60100,4.1.60002,4.1.60000
-      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
+      ,,,,,,,,,,,,,,,,
-      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
+      DEVELOPMENT TOOLS,,,,,,,,,,,,,,,,
-      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
+      :doc:`HIPIFY <hipify:index>`,19.0.0,19.0.0,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      ,,,,,,,,,,,,,,,
+      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.14.0,0.13.0,0.13.0,0.13.0,0.13.0,0.12.0,0.12.0,0.12.0,0.12.0,0.11.0,0.11.0
-      COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
+      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.2,0.77.0,0.77.0,0.77.0,0.77.0,0.76.0,0.76.0,0.76.0,0.76.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0,0.71.0
-      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
+      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,15.2.0,14.2.0,14.2.0,14.2.0,14.2.0,14.1.0,14.1.0,14.1.0,14.1.0,13.2.0,13.2.0
-      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
+      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.4.0,0.3.0,0.3.0,0.3.0,0.3.0,N/A,N/A
-      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.4,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3,2.0.3
-      :doc:`llvm-project <llvm-project:index>`,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      ,,,,,,,,,,,,,,,,
-      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
+      COMPILERS,.. _compilers-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
-,,,,,,,,,,,,,,,
+      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0,0.5.0
-      RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,
+      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.1.1,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0,1.0.0
-      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24455,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      :doc:`HIP <hip:index>`,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
+      :doc:`llvm-project <llvm-project:index>`,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25184,19.0.0.25133,18.0.0.25012,18.0.0.25012,18.0.0.24491,18.0.0.24491,18.0.0.24392,18.0.0.24355,18.0.0.24355,18.0.0.24232,17.0.0.24193,17.0.0.24193,17.0.0.24154,17.0.0.24103,17.0.0.24012,17.0.0.23483
-      :doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
+,,,,,,,,,,,,,,,,
      RUNTIMES,.. _runtime-support-compatibility-matrix-past-60:,,,,,,,,,,,,,,,
      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
      :doc:`HIP <hip:index>`,6.4.43483,6.4.43482,6.3.42134,6.3.42134,6.3.42133,6.3.42131,6.2.41134,6.2.41134,6.2.41134,6.2.41133,6.1.40093,6.1.40093,6.1.40092,6.1.40091,6.1.32831,6.1.32830
      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0,2.0.0
      :doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.15.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.14.0,1.13.0,1.13.0,1.13.0,1.13.0,1.13.0,1.12.0,1.12.0
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -23,127 +23,131 @@ compatibility and system requirements.
 .. container:: format-big-table
  .. csv-table::
-      :header: "ROCm Version", "6.4.0", "6.3.3", "6.2.0"
+      :header: "ROCm Version", "6.4.1", "6.4.0", "6.3.0"
      :stub-columns: 1
-      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04
+      :ref:`Operating systems & kernels <OS-kernel-versions>`,Ubuntu 24.04.2,Ubuntu 24.04.2,Ubuntu 24.04.2
-      ,Ubuntu 22.04.5,Ubuntu 22.04.5,"Ubuntu 22.04.5, 22.04.4"
+      ,Ubuntu 22.04.5,Ubuntu 22.04.5,Ubuntu 22.04.5
-      ,"RHEL 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.4, 9.3"
+      ,"RHEL 9.6, 9.5, 9.4","RHEL 9.5, 9.4","RHEL 9.5, 9.4"
-      ,RHEL 8.10,RHEL 8.10,"RHEL 8.10, 8.9"
+      ,RHEL 8.10,RHEL 8.10,RHEL 8.10
-      ,"SLES 15 SP6","SLES 15 SP6, SP5","SLES 15 SP6, SP5"
+      ,SLES 15 SP6,SLES 15 SP6,"SLES 15 SP6, SP5"
-      ,"Oracle Linux 9, 8 [#mi300x]_",Oracle Linux 8.10 [#mi300x]_,Oracle Linux 8.9 [#mi300x]_
+      ,"Oracle Linux 9, 8 [#mi300x]_","Oracle Linux 9, 8 [#mi300x]_",Oracle Linux 8.10 [#mi300x]_
      ,Debian 12 [#single-node]_,Debian 12 [#single-node]_,
      ,Azure Linux 3.0 [#mi300x]_,Azure Linux 3.0 [#mi300x]_,
      ,.. _architecture-support-compatibility-matrix:,,
      :doc:`Architecture <rocm-install-on-linux:reference/system-requirements>`,CDNA3,CDNA3,CDNA3
      ,CDNA2,CDNA2,CDNA2
      ,CDNA,CDNA,CDNA
      ,RDNA4,,
      ,RDNA3,RDNA3,RDNA3
      ,RDNA2,RDNA2,RDNA2
      ,.. _gpu-support-compatibility-matrix:,,
-      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1100,gfx1100,gfx1100
+      :doc:`GPU / LLVM target <rocm-install-on-linux:reference/system-requirements>`,gfx1201 [#RDNA-OS]_,,
      ,gfx1200 [#RDNA-OS]_,,
      ,gfx1101 [#RDNA-OS]_,,
      ,gfx1100,gfx1100,gfx1100
      ,gfx1030,gfx1030,gfx1030
-      ,gfx942,gfx942,gfx942 [#mi300_620]_
+      ,gfx942,gfx942,gfx942
      ,gfx90a,gfx90a,gfx90a
      ,gfx908,gfx908,gfx908
      ,,,
      FRAMEWORK SUPPORT,.. _framework-support-compatibility-matrix:,,
-      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 1.13","2.3, 2.2, 2.1, 2.0, 1.13"
+      :doc:`PyTorch <../compatibility/ml-compatibility/pytorch-compatibility>`,"2.6, 2.5, 2.4, 2.3","2.6, 2.5, 2.4, 2.3","2.4, 2.3, 2.2, 2.1, 2.0, 1.13"
-      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1"
+      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1"
-      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.31,0.4.26
+      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.31
-      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.20,1.17.3,1.17.3
+      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.20,1.20,1.17.3
      ,,,
      THIRD PARTY COMMS,.. _thirdpartycomms-support-compatibility-matrix:,,
      `UCC <https://github.com/ROCm/ucc>`_,>=1.3.0,>=1.3.0,>=1.3.0
      `UCX <https://github.com/ROCm/ucx>`_,>=1.15.0,>=1.15.0,>=1.15.0
      ,,,
      THIRD PARTY ALGORITHM,.. _thirdpartyalgorithm-support-compatibility-matrix:,,
-      Thrust,2.5.0,2.3.2,2.2.0
+      Thrust,2.5.0,2.5.0,2.3.2
-      CUB,2.5.0,2.3.2,2.2.0
+      CUB,2.5.0,2.5.0,2.3.2
      ,,,
      KMD & USER SPACE [#kfd_support]_,.. _kfd-userspace-support-compatibility-matrix:,,
-      KMD versions,"6.4.x, 6.3.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x, 6.0.x"
+      :doc:`KMD versions <rocm-install-on-linux:reference/user-kernel-space-compat-matrix>`,"6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x","6.4.x, 6.3.x, 6.2.x, 6.1.x"
      ,,,
      ML & COMPUTER VISION,.. _mllibs-support-compatibility-matrix:,,
      :doc:`Composable Kernel <composable_kernel:index>`,1.1.0,1.1.0,1.1.0
-      :doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.11.0,2.10.0
+      :doc:`MIGraphX <amdmigraphx:index>`,2.12.0,2.12.0,2.11.0
-      :doc:`MIOpen <miopen:index>`,3.4.0,3.3.0,3.2.0
+      :doc:`MIOpen <miopen:index>`,3.4.0,3.4.0,3.3.0
-      :doc:`MIVisionX <mivisionx:index>`,3.2.0,3.1.0,3.0.0
+      :doc:`MIVisionX <mivisionx:index>`,3.2.0,3.2.0,3.1.0
-      :doc:`rocAL <rocal:index>`,2.2.0,2.1.0,1.0.0
+      :doc:`rocAL <rocal:index>`,2.2.0,2.2.0,2.1.0
-      :doc:`rocDecode <rocdecode:index>`,0.10.0,0.8.0,0.6.0
+      :doc:`rocDecode <rocdecode:index>`,0.10.0,0.10.0,0.8.0
-      :doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.6.0,N/A
+      :doc:`rocJPEG <rocjpeg:index>`,0.8.0,0.8.0,0.6.0
-      :doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.2.0,0.1.0
+      :doc:`rocPyDecode <rocpydecode:index>`,0.3.1,0.3.1,0.2.0
-      :doc:`RPP <rpp:index>`,1.9.10,1.9.1,1.8.0
+      :doc:`RPP <rpp:index>`,1.9.10,1.9.10,1.9.1
      ,,,
      COMMUNICATION,.. _commlibs-support-compatibility-matrix:,,
-      :doc:`RCCL <rccl:index>`,2.22.3,2.21.5,2.20.5
+      :doc:`RCCL <rccl:index>`,2.22.3,2.22.3,2.21.5
-      `rocSHMEM <https://github.com/ROCm/rocSHMEM>`_ ,2.0.0,N/A,N/A
+      :doc:`rocSHMEM <rocshmem:index>`,2.0.0,2.0.0,N/A
      ,,,
      MATH LIBS,.. _mathlibs-support-compatibility-matrix:,,
      `half <https://github.com/ROCm/half>`_ ,1.12.0,1.12.0,1.12.0
-      :doc:`hipBLAS <hipblas:index>`,2.4.0,2.3.0,2.2.0
+      :doc:`hipBLAS <hipblas:index>`,2.4.0,2.4.0,2.3.0
-      :doc:`hipBLASLt <hipblaslt:index>`,0.12.0,0.10.0,0.8.0
+      :doc:`hipBLASLt <hipblaslt:index>`,0.12.1,0.12.0,0.10.0
-      :doc:`hipFFT <hipfft:index>`,1.0.18,1.0.17,1.0.14
+      :doc:`hipFFT <hipfft:index>`,1.0.18,1.0.18,1.0.17
-      :doc:`hipfort <hipfort:index>`,0.6.0,0.5.1,0.4.0
+      :doc:`hipfort <hipfort:index>`,0.6.0,0.6.0,0.5.0
-      :doc:`hipRAND <hiprand:index>`,2.12.0,2.11.1,2.11.0
+      :doc:`hipRAND <hiprand:index>`,2.12.0,2.12.0,2.11.0
-      :doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.3.0,2.2.0
+      :doc:`hipSOLVER <hipsolver:index>`,2.4.0,2.4.0,2.3.0
-      :doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.1.2,3.1.1
+      :doc:`hipSPARSE <hipsparse:index>`,3.2.0,3.2.0,3.1.2
-      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.2,0.2.1
+      :doc:`hipSPARSELt <hipsparselt:index>`,0.2.3,0.2.3,0.2.2
-      :doc:`rocALUTION <rocalution:index>`,3.2.2,3.2.1,3.2.0
+      :doc:`rocALUTION <rocalution:index>`,3.2.3,3.2.2,3.2.1
-      :doc:`rocBLAS <rocblas:index>`,4.4.0,4.3.0,4.2.0
+      :doc:`rocBLAS <rocblas:index>`,4.4.0,4.4.0,4.3.0
-      :doc:`rocFFT <rocfft:index>`,1.0.32,1.0.31,1.0.28
+      :doc:`rocFFT <rocfft:index>`,1.0.32,1.0.32,1.0.31
-      :doc:`rocRAND <rocrand:index>`,3.3.0,3.2.0,3.1.0
+      :doc:`rocRAND <rocrand:index>`,3.3.0,3.3.0,3.2.0
-      :doc:`rocSOLVER <rocsolver:index>`,3.28.0,3.27.0,3.26.0
+      :doc:`rocSOLVER <rocsolver:index>`,3.28.0,3.28.0,3.27.0
-      :doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.3.0,3.2.0
+      :doc:`rocSPARSE <rocsparse:index>`,3.4.0,3.4.0,3.3.0
-      :doc:`rocWMMA <rocwmma:index>`,1.7.0,1.6.0,1.5.0
+      :doc:`rocWMMA <rocwmma:index>`,1.7.0,1.7.0,1.6.0
-      :doc:`Tensile <tensile:src/index>`,4.43.0,4.42.0,4.41.0
+      :doc:`Tensile <tensile:src/index>`,4.43.0,4.43.0,4.42.0
      ,,,
      PRIMITIVES,.. _primitivelibs-support-compatibility-matrix:,,
-      :doc:`hipCUB <hipcub:index>`,3.4.0,3.3.0,3.2.0
+      :doc:`hipCUB <hipcub:index>`,3.4.0,3.4.0,3.3.0
-      :doc:`hipTensor <hiptensor:index>`,1.5.0,1.4.0,1.3.0
+      :doc:`hipTensor <hiptensor:index>`,1.5.0,1.5.0,1.4.0
-      :doc:`rocPRIM <rocprim:index>`,3.4.0,3.3.0,3.2.0
+      :doc:`rocPRIM <rocprim:index>`,3.4.0,3.4.0,3.3.0
-      :doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.0.1
+      :doc:`rocThrust <rocthrust:index>`,3.3.0,3.3.0,3.3.0
      ,,,
      SUPPORT LIBS,,,
-      `hipother <https://github.com/ROCm/hipother>`_,6.4.43482,6.3.42134,6.2.41133
+      `hipother <https://github.com/ROCm/hipother>`_,6.4.43483,6.4.43482,6.3.42131
-      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.0,6.3.3,6.2.0
+      `rocm-core <https://github.com/ROCm/rocm-core>`_,6.4.1,6.4.0,6.3.0
-      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,20240607.1.4246
+      `ROCT-Thunk-Interface <https://github.com/ROCm/ROCT-Thunk-Interface>`_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_,N/A [#ROCT-rocr]_
      ,,,
      SYSTEM MGMT TOOLS,.. _tools-support-compatibility-matrix:,,
-      :doc:`AMD SMI <amdsmi:index>`,25.3.0,24.7.1,24.6.2
+      :doc:`AMD SMI <amdsmi:index>`,25.4.2,25.3.0,24.7.1
      :doc:`ROCm Data Center Tool <rdc:index>`,0.3.0,0.3.0,0.3.0
      :doc:`rocminfo <rocminfo:index>`,1.0.0,1.0.0,1.0.0
-      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.5.0,7.4.0,7.3.0
+      :doc:`ROCm SMI <rocm_smi_lib:index>`,7.5.0,7.5.0,7.4.0
-      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.0.60200
+      :doc:`ROCm Validation Suite <rocmvalidationsuite:index>`,1.1.0,1.1.0,1.1.0
      ,,,
      PERFORMANCE TOOLS,,,
      :doc:`ROCm Bandwidth Test <rocm_bandwidth_test:index>`,1.4.0,1.4.0,1.4.0
-      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.0,3.0.0,2.0.1
+      :doc:`ROCm Compute Profiler <rocprofiler-compute:index>`,3.1.0,3.1.0,3.0.0
-      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.0,0.1.2,1.11.2
+      :doc:`ROCm Systems Profiler <rocprofiler-systems:index>`,1.0.1,1.0.0,0.1.0
-      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60400,2.0.60303,2.0.60200
+      :doc:`ROCProfiler <rocprofiler:index>`,2.0.60401,2.0.60400,2.0.60300
-      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.5.0,0.4.0
+      :doc:`ROCprofiler-SDK <rocprofiler-sdk:index>`,0.6.0,0.6.0,0.5.0
-      :doc:`ROCTracer <roctracer:index>`,4.1.60400,4.1.60303,4.1.60200
+      :doc:`ROCTracer <roctracer:index>`,4.1.60401,4.1.60400,4.1.60300
      ,,,
      DEVELOPMENT TOOLS,,,
-      :doc:`HIPIFY <hipify:index>`,19.0.0.25133,18.0.0.25012,18.0.0.24232
+      :doc:`HIPIFY <hipify:index>`,19.0.0,19.0.0,18.0.0.24455
-      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.13.0
+      :doc:`ROCm CMake <rocmcmakebuildtools:index>`,0.14.0,0.14.0,0.14.0
-      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.0,0.76.0
+      :doc:`ROCdbgapi <rocdbgapi:index>`,0.77.2,0.77.2,0.77.0
-      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,14.2.0
+      :doc:`ROCm Debugger (ROCgdb) <rocgdb:index>`,15.2.0,15.2.0,15.2.0
      `rocprofiler-register <https://github.com/ROCm/rocprofiler-register>`_,0.4.0,0.4.0,0.4.0
-      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.3,2.0.3
+      :doc:`ROCr Debug Agent <rocr_debug_agent:index>`,2.0.4,2.0.4,2.0.3
      ,,,
      COMPILERS,.. _compilers-support-compatibility-matrix:,,
      `clang-ocl <https://github.com/ROCm/clang-ocl>`_,N/A,N/A,N/A
      :doc:`hipCC <hipcc:index>`,1.1.1,1.1.1,1.1.1
-      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25133,18.0.0.25012,18.0.0.24232
+      `Flang <https://github.com/ROCm/flang>`_,19.0.0.25184,19.0.0.25133,18.0.0.24455
-      :doc:`llvm-project <llvm-project:index>`,19.0.0.25133,18.0.0.25012,18.0.0.24232
+      :doc:`llvm-project <llvm-project:index>`,19.0.0.25184,19.0.0.25133,18.0.0.24491
-      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25133,18.0.0.25012,18.0.0.24232
+      `OpenMP <https://github.com/ROCm/llvm-project/tree/amd-staging/openmp>`_,19.0.0.25184,19.0.0.25133,18.0.0.24491
      ,,,
      RUNTIMES,.. _runtime-support-compatibility-matrix:,,
-      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43482,6.3.42134,6.2.41133
+      :doc:`AMD CLR <hip:understand/amd_clr>`,6.4.43483,6.4.43482,6.3.42131
-      :doc:`HIP <hip:index>`,6.4.43482,6.3.42134,6.2.41133
+      :doc:`HIP <hip:index>`,6.4.43483,6.4.43482,6.3.42131
      `OpenCL Runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_,2.0.0,2.0.0,2.0.0
-      :doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.14.0,1.13.0
+      :doc:`ROCr Runtime <rocr-runtime:index>`,1.15.0,1.15.0,1.14.0
 .. rubric:: Footnotes
@@ -153,6 +157,7 @@ compatibility and system requirements.
 .. [#mi300_620] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
 .. [#kfd_support] Starting from ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart (assuming hardware support is available in both). For earlier ROCm releases, the compatibility is provided for +/- 2 releases. These are the compatibility combinations that are currently supported.
 .. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
 .. [#RDNA-OS] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.
 .. _OS-kernel-versions:
@@ -170,7 +175,8 @@ Use this lookup table to confirm which operating system and kernel versions are
   ,,
   `Ubuntu <https://ubuntu.com/about/release-cycle#ubuntu-kernel-release-cycle>`_, 22.04.5, "5.15 GA, 6.8 HWE", 2.35
   ,,
-   `Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.5, 5.14+, 2.34
+   `Red Hat Enterprise Linux (RHEL 9) <https://access.redhat.com/articles/3078#RHEL9>`_, 9.6, 5.14+, 2.34
   , 9.5, 5.14+, 2.34
   ,9.4, 5.14+, 2.34
   ,9.3, 5.14+, 2.34
   ,,
@@ -231,3 +237,4 @@ Expand for full historical view of:
   .. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
   .. [#kfd_support-past-60] Starting from ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart (assuming hardware support is available in both). For earlier ROCm releases, the compatibility is provided for +/- 2 releases. These are the compatibility combinations that are currently supported.
   .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
   .. [#RDNA-OS-past-60] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.
--- a/docs/compatibility/ml-compatibility/jax-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/jax-compatibility.rst
@@ -97,7 +97,7 @@ Docker image compatibility
 AMD validates and publishes ready-made `ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax>`_
 with ROCm backends on Docker Hub. The following Docker image tags and
 associated inventories represent the latest JAX version from the official Docker Hub and are validated for
-`ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_. Click the |docker-icon|
+`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
 icon to view the image on Docker Hub.
 .. list-table:: JAX Docker image components
@@ -110,19 +110,19 @@ icon to view the image on Docker Hub.
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4-jax0.4.35-py3.12/images/sha256-4069398229078f3311128b6d276c6af377c7e97d3363d020b0bf7154fae619ca"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
+           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.1-jax0.4.35-py3.12/images/sha256-7a0745a2a2758bdf86397750bac00e9086cbf67d170cfdbb08af73f7c7d18a6a"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
      - `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
      - Ubuntu 24.04
-      - `3.12.7 <https://www.python.org/downloads/release/python-3127/>`_
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4-jax0.4.35-py3.10/images/sha256-a137f901f91ce6c13b424c40a6cf535248d4d20fd36d5daf5eee0570190a4a11"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
+           <a href="https://hub.docker.com/layers/rocm/jax/rocm6.4.1-jax0.4.35-py3.10/images/sha256-5f9e8d6e6e69fdc9a1a3f2ba3b1234c3f46c53b7468538c07fd18b00899da54f"><i class="fab fa-docker fa-lg"></i> rocm/jax</a>
      - `0.4.35 <https://github.com/ROCm/jax/releases/tag/rocm-jax-v0.4.35>`_
      - Ubuntu 22.04
-      - `3.10.14 <https://www.python.org/downloads/release/python-31014/>`_
+      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
 AMD publishes `Community ROCm JAX Docker images <https://hub.docker.com/r/rocm/jax-community>`_
 with ROCm backends on Docker Hub. The following Docker image tags and
--- a/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/pytorch-compatibility.rst
@@ -95,7 +95,7 @@ Docker image compatibility
 AMD validates and publishes `PyTorch images <https://hub.docker.com/r/rocm/pytorch>`_
 with ROCm backends on Docker Hub. The following Docker image tags and associated
-inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_.
+inventories were tested on `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_.
 Click |docker-icon| to view the image on Docker Hub.
 .. list-table:: PyTorch Docker image components
@@ -116,137 +116,122 @@ Click |docker-icon| to view the image on Docker Hub.
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-ab1d350b818b90123cfda31363019d11c0d41a8f12a19e3cb2cb40cf0261137d"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.6.0/images/sha256-c76af9bfb1c25b0f40d4c29e8652105c57250bf018d23ff595b06bd79666fdd7"><i class="fab fa-docker fa-lg"></i></a>
      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`_
      - `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`_
-      - `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.6.0/images/sha256-130536fdfceb374626a7bcb8d00b9d796ddfc3115677d51229e5b852d96b5ef4"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.6.0/images/sha256-f9d226135d51831c810dcb1251636ec61f85c65fcdda03e188c053a5d4f6585b"><i class="fab fa-docker fa-lg"></i></a>
      - `2.6.0 <https://github.com/ROCm/pytorch/tree/release/2.6>`_
      - 22.04
-      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `1.6.0 <https://github.com/ROCm/apex/tree/release/1.6.0>`_
      - `0.21.0 <https://github.com/pytorch/vision/tree/v0.21.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
-      - `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.5.1/images/sha256-20a2e24b4738dc1f1a44a04f23827918b56c99f7e697e6fccb90e9c4fae8ca9b"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.5.1/images/sha256-3490e74d4f43dcdb3351dd334108d1ccd47e5a687c0523a2424ac1bcdd3dd6dd"><i class="fab fa-docker fa-lg"></i></a>
      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`_
      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`_
-      - `4.0.7 <https://github.com/open-mpi/ompi/tree/v4.0.7>`_
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.11_pytorch_release_2.5.1/images/sha256-f09cb8ca39cc39222fb554060711f5c19130f7b4047aaf41fad4ba3ec470ca03"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.5.1/images/sha256-26c5dfffb4a54625884abca83166940f17dd27bc75f1b24f6e80fbcb7d4e9afb"><i class="fab fa-docker fa-lg"></i></a>
      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
      - 22.04
-      - `3.11.9 <https://www.python.org/downloads/release/python-3119/>`_
+      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`_
      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
-      - `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.5.1/images/sha256-a91c100d1fe608dae3eb7f60a751630363d4027ac3d077d428e92945204c338e"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-f378a24561fa6efc178b6dc93fc7d82e5b93653ecd59c89d4476674d29e1284d"><i class="fab fa-docker fa-lg"></i></a>
-      - `2.5.1 <https://github.com/ROCm/pytorch/tree/release/2.5>`_
-      - 22.04
-      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
-      - `1.5.0 <https://github.com/ROCm/apex/tree/release/1.5.0>`_
-      - `0.20.1 <https://github.com/pytorch/vision/tree/v0.20.1>`_
-      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
-      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
-      - `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
-      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
-    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.4.1/images/sha256-66a89ce6485bb887af74bb9bd76bb613ab9834a6b1374649ea7ae379883454a4"><i class="fab fa-docker fa-lg"></i></a>
      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
      - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`_
-      - `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-c716cf167e6e49893f11de03606ed37044153aca089e74ca615065c06877f86b"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.4.1/images/sha256-2308dbd0e650b7bf8d548575cbb6e2bdc021f9386384ce570da16d58ee684d22"><i class="fab fa-docker fa-lg"></i></a>
      - `2.4.1 <https://github.com/ROCm/pytorch/tree/release/2.4>`_
      - 22.04
-      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `1.4.0 <https://github.com/ROCm/apex/tree/release/1.4.0>`_
      - `0.19.0 <https://github.com/pytorch/vision/tree/v0.19.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13.0>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.14.1 <https://github.com/openucx/ucx/tree/v1.14.1>`_
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
-      - `4.1.5 <https://github.com/open-mpi/ompi/tree/v4.1.5>`_
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu24.04_py3.12_pytorch_release_2.3.0/images/sha256-0434cbc9b07b2c26e39480d7447f676f9057a1054dcff00e0050c25a6eddbd3c"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.3.0/images/sha256-eefd2ab019728f91f94c5e6a9463cb0ea900b3011458d18fe5d88e50c0b57d86"><i class="fab fa-docker fa-lg"></i></a>
      - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
      - 24.04
-      - `3.12.9 <https://www.python.org/downloads/release/python-3129/>`_
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
      - `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
+      - `1.16.0+ds-5ubuntu1 <https://github.com/openucx/ucx/tree/v1.16.0>`_
-      - `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
+      - `4.1.6-7ubuntu2 <https://github.com/open-mpi/ompi/tree/v4.1.6>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-688b1c0073092615fb98778d78b16191e506097ee116a2d3d2628b264d5d367b"><i class="fab fa-docker fa-lg"></i></a>
+           <a href="https://hub.docker.com/layers/rocm/pytorch/rocm6.4.1_ubuntu22.04_py3.10_pytorch_release_2.3.0/images/sha256-473643226ab0e93a04720b256ed772619878abf9c42b9f84828cefed522696fd"><i class="fab fa-docker fa-lg"></i></a>
      - `2.3.0 <https://github.com/ROCm/pytorch/tree/release/2.3>`_
      - 22.04
-      - `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `1.3.0 <https://github.com/ROCm/apex/tree/release/1.3.0>`_
      - `0.18.0 <https://github.com/pytorch/vision/tree/v0.18.0>`_
      - `2.13.0 <https://github.com/tensorflow/tensorboard/tree/2.13>`_
      - `master <https://bitbucket.org/icl/magma/src/master/>`_
-      - `1.10.0 <https://github.com/openucx/ucx/tree/v1.10.0>`_
+      - `1.12.1~rc2-1 <https://github.com/openucx/ucx/tree/v1.12.1>`_
-      - `4.0.3 <https://github.com/open-mpi/ompi/tree/v4.0.3>`_
+      - `4.1.2-2ubuntu1 <https://github.com/open-mpi/ompi/tree/v4.1.2>`_
      - `5.3-1.0.5.0 <https://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz>`_
 Key ROCm libraries for PyTorch
--- a/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/tensorflow-compatibility.rst
@@ -56,7 +56,7 @@ Docker image compatibility
 AMD validates and publishes ready-made `TensorFlow images
 <https://hub.docker.com/r/rocm/tensorflow>`_ with ROCm backends on
 Docker Hub. The following Docker image tags and associated inventories are
-validated for `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`_. Click
+validated for `ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click
 the |docker-icon| icon to view the image on Docker Hub.
 .. list-table:: TensorFlow Docker image components
@@ -73,82 +73,122 @@ the |docker-icon| icon to view the image on Docker Hub.
           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.12-tf2.18-dev/images/sha256-fa9cf5fa6c6079a7118727531ccd0056c6e3224a42c3d6e78a49e7781daafff4"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
-      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
      - dev
      - 24.04
-      - `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
+      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.12-tf2.18-runtime/images/sha256-14addca4b92a47c806b83ebaeed593fc6672cd99f0017ed8dad759fe72ed0309"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.18-runtime/images/sha256-d14d8c4989e7c9a60f4e72461b9e349de72347c6162dcd6897e6f4f80ffbb440"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
-      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
      - runtime
      - 24.04
-      - `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
+      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.10-tf2.18-dev/images/sha256-f5e151060df04ff5fb59f5604b49cd371931bbe75b06aec9fe7781397c4be0ce"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.18-dev/images/sha256-081e5bd6615a5dc17247ebd2ccc26895c3feeff086720400fa39b477e60a77c0"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
-      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
      - dev
      - 22.04
-      - `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.10-tf2.18-runtime/images/sha256-5cd4c03fdb1036570c0d4929da60a65c4466998dc80f1dc8a5a0b173eae017fb"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.18-runtime/images/sha256-bf369637378264f4af6ddad5ca8b8611d3e372ffbea9ab7a06f1e122f0a0867b"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
-      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.18.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.18.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
      - runtime
      - 22.04
-      - `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `TensorBoard 2.18.0 <https://github.com/tensorflow/tensorboard/tree/2.18.0>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.12-tf2.17-dev/images/sha256-b3add80e374a2db2d1088d746e740afa89d439aca02cacba959ad298f5cd2b3f"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.17-dev/images/sha256-5a502008c50d0b6508e6027f911bdff070a7493700ae064bed74e1d22b91ed50"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
      - dev
      - 24.04
-      - `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
+      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.12-tf2.17-runtime/images/sha256-3a244f026c32177eff7958ffbad390de85b438b2b48b455cc39f15d70fa1270d"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.17-runtime/images/sha256-1ee5dfffceb71ac66617ada33de3a10de0cb74199cc4b82441192e5e92fa2ddf"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.17.1-cp312-cp312-manylinux_2_28_x86_64.whl>`__
      - runtime
      - 24.04
-      - `Python 3.12.4 <https://www.python.org/downloads/release/python-3124/>`_
+      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.10-tf2.17-dev/images/sha256-e0cecdfacb59169335049983cdab6da578c209bb9f4d08aad97e184ae59171a6"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.17-dev/images/sha256-109218ad92bfae83bbd2710475f7502166e1ed54ca0b9748a9cbc3f5a1d75af1"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
-      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
      - dev
      - 22.04
-      - `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
    * - .. raw:: html
-           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4-py3.10-tf2.17-runtime/images/sha256-6f43de12f7eb202791b698ac51d28b72098de90034dbcd48486629b0125f7707"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
+           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.17-runtime/images/sha256-5d78bd5918d394f92263daa2990e88d695d27200dd90ed83ec64d20c7661c9c1"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
-      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
+      - `tensorflow-rocm 2.17.1 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.17.1-cp310-cp310-manylinux_2_28_x86_64.whl>`__
      - runtime
      - 22.04
-      - `Python 3.10.16 <https://www.python.org/downloads/release/python-31016/>`_
+      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `TensorBoard 2.17.1 <https://github.com/tensorflow/tensorboard/tree/2.17.1>`_
    * - .. raw:: html
           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.16-dev/images/sha256-b09b1ad921c09c687b7c916141051e9fcf15539a5686e5aa67c689195a522719"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
      - dev
      - 24.04
      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
    * - .. raw:: html
           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.12-tf2.16-runtime/images/sha256-20dbd824e85558abfe33fc9283cc547d88cde3c623fe95322743a5082f883a64"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp312-cp312-manylinux_2_28_x86_64.whl>`__
      - runtime
      - 24.04
      - `Python 3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
    * - .. raw:: html
           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.16-dev/images/sha256-36c4fa047c86e2470ac473ec1429aea6d4b8934b90ffeb34d1afab40e7e5b377"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
      - dev
      - 22.04
      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
    * - .. raw:: html
           <a href="https://hub.docker.com/layers/rocm/tensorflow/rocm6.4.1-py3.10-tf2.16-runtime/images/sha256-a94150ffb81365234ebfa34e764db5474bc6ab7d141b56495eac349778dafcf3"><i class="fab fa-docker fa-lg"></i> rocm/tensorflow</a>
      - `tensorflow-rocm 2.16.2 <https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/tensorflow_rocm-2.16.2-cp310-cp310-manylinux_2_28_x86_64.whl>`__
      - runtime
      - 22.04
      - `Python 3.10.17 <https://www.python.org/downloads/release/python-31017/>`_
      - `TensorBoard 2.16.2 <https://github.com/tensorflow/tensorboard/tree/2.16.2>`_
 Critical ROCm libraries for TensorFlow
 ===============================================================================
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -34,15 +34,15 @@ project = "ROCm Documentation"
 project_path = os.path.abspath(".").replace("\\", "/")
 author = "Advanced Micro Devices, Inc."
 copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
-version = "6.4.0"
+version = "6.4.1"
-release = "6.4.0"
+release = "6.4.1"
 setting_all_article_info = True
 all_article_info_os = ["linux", "windows"]
 all_article_info_author = ""
 # pages with specific settings
 article_pages = [
-    {"file": "about/release-notes", "os": ["linux"], "date": "2025-04-11"},
+    {"file": "about/release-notes", "os": ["linux"], "date": "2025-05-07"},
    {"file": "release/changelog", "os": ["linux"],},
    {"file": "compatibility/compatibility-matrix", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
@@ -71,8 +71,9 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/inference/vllm-benchmark", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]},
-    {"file": "how-to/rocm-for-ai/inference/pytorch-inference-benchmark", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.7.3_20250325-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.7.3_20250325-benchmark-models.yaml
@@ -0,0 +1,159 @@
 vllm_benchmark:
  unified_docker:
    latest:
      pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.7.3_20250325
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640
      rocm_version: 6.3.1
      vllm_version: 0.7.3
      pytorch_version: 2.7.0 (dev nightly)
      hipblaslt_version: 0.13
  model_groups:
    - group: Llama
      tag: llama
      models:
      - model: Llama 3.1 8B
        mad_tag: pyt_vllm_llama-3.1-8b
        model_repo: meta-llama/Llama-3.1-8B-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
        precision: float16
      - model: Llama 3.1 70B
        mad_tag: pyt_vllm_llama-3.1-70b
        model_repo: meta-llama/Llama-3.1-70B-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
        precision: float16
      - model: Llama 3.1 405B
        mad_tag: pyt_vllm_llama-3.1-405b
        model_repo: meta-llama/Llama-3.1-405B-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
        precision: float16
      - model: Llama 3.2 11B Vision
        mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
        model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
        precision: float16
      - model: Llama 2 7B
        mad_tag: pyt_vllm_llama-2-7b
        model_repo: meta-llama/Llama-2-7b-chat-hf
        url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
        precision: float16
      - model: Llama 2 70B
        mad_tag: pyt_vllm_llama-2-70b
        model_repo: meta-llama/Llama-2-70b-chat-hf
        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
        precision: float16
      - model: Llama 3.1 8B FP8
        mad_tag: pyt_vllm_llama-3.1-8b_fp8
        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
        precision: float8
      - model: Llama 3.1 70B FP8
        mad_tag: pyt_vllm_llama-3.1-70b_fp8
        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
        precision: float8
      - model: Llama 3.1 405B FP8
        mad_tag: pyt_vllm_llama-3.1-405b_fp8
        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
        precision: float8
    - group: Mistral
      tag: mistral
      models:
      - model: Mixtral MoE 8x7B
        mad_tag: pyt_vllm_mixtral-8x7b
        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
        precision: float16
      - model: Mixtral MoE 8x22B
        mad_tag: pyt_vllm_mixtral-8x22b
        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
        precision: float16
      - model: Mistral 7B
        mad_tag: pyt_vllm_mistral-7b
        model_repo: mistralai/Mistral-7B-Instruct-v0.3
        url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
        precision: float16
      - model: Mixtral MoE 8x7B FP8
        mad_tag: pyt_vllm_mixtral-8x7b_fp8
        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
        precision: float8
      - model: Mixtral MoE 8x22B FP8
        mad_tag: pyt_vllm_mixtral-8x22b_fp8
        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
        precision: float8
      - model: Mistral 7B FP8
        mad_tag: pyt_vllm_mistral-7b_fp8
        model_repo: amd/Mistral-7B-v0.1-FP8-KV
        url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
        precision: float8
    - group: Qwen
      tag: qwen
      models:
      - model: Qwen2 7B
        mad_tag: pyt_vllm_qwen2-7b
        model_repo: Qwen/Qwen2-7B-Instruct
        url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
        precision: float16
      - model: Qwen2 72B
        mad_tag: pyt_vllm_qwen2-72b
        model_repo: Qwen/Qwen2-72B-Instruct
        url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
        precision: float16
    - group: JAIS
      tag: jais
      models:
      - model: JAIS 13B
        mad_tag: pyt_vllm_jais-13b
        model_repo: core42/jais-13b-chat
        url: https://huggingface.co/core42/jais-13b-chat
        precision: float16
      - model: JAIS 30B
        mad_tag: pyt_vllm_jais-30b
        model_repo: core42/jais-30b-chat-v3
        url: https://huggingface.co/core42/jais-30b-chat-v3
        precision: float16
    - group: DBRX
      tag: dbrx
      models:
      - model: DBRX Instruct
        mad_tag: pyt_vllm_dbrx-instruct
        model_repo: databricks/dbrx-instruct
        url: https://huggingface.co/databricks/dbrx-instruct
        precision: float16
      - model: DBRX Instruct FP8
        mad_tag: pyt_vllm_dbrx_fp8
        model_repo: amd/dbrx-instruct-FP8-KV
        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
        precision: float8
    - group: Gemma
      tag: gemma
      models:
      - model: Gemma 2 27B
        mad_tag: pyt_vllm_gemma-2-27b
        model_repo: google/gemma-2-27b
        url: https://huggingface.co/google/gemma-2-27b
        precision: float16
    - group: Cohere
      tag: cohere
      models:
      - model: C4AI Command R+ 08-2024
        mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
        model_repo: CohereForAI/c4ai-command-r-plus-08-2024
        url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
        precision: float16
      - model: C4AI Command R+ 08-2024 FP8
        mad_tag: pyt_vllm_command-r-plus_fp8
        model_repo: amd/c4ai-command-r-plus-FP8-KV
        url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
        precision: float8
    - group: DeepSeek
      tag: deepseek
      models:
      - model: DeepSeek MoE 16B
        mad_tag: pyt_vllm_deepseek-moe-16b-chat
        model_repo: deepseek-ai/deepseek-moe-16b-chat
        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
        precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.3_20250415-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.3_20250415-benchmark-models.yaml
@@ -0,0 +1,152 @@
 vllm_benchmark:
  unified_docker:
    latest:
      pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845
      rocm_version: 6.3.1
      vllm_version: 0.8.3
      pytorch_version: 2.7.0 (dev nightly)
      hipblaslt_version: 0.13
  model_groups:
    - group: Llama
      tag: llama
      models:
      - model: Llama 3.1 8B
        mad_tag: pyt_vllm_llama-3.1-8b
        model_repo: meta-llama/Llama-3.1-8B-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
        precision: float16
      - model: Llama 3.1 70B
        mad_tag: pyt_vllm_llama-3.1-70b
        model_repo: meta-llama/Llama-3.1-70B-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
        precision: float16
      - model: Llama 3.1 405B
        mad_tag: pyt_vllm_llama-3.1-405b
        model_repo: meta-llama/Llama-3.1-405B-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
        precision: float16
      - model: Llama 3.2 11B Vision
        mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
        model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
        precision: float16
      - model: Llama 2 7B
        mad_tag: pyt_vllm_llama-2-7b
        model_repo: meta-llama/Llama-2-7b-chat-hf
        url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
        precision: float16
      - model: Llama 2 70B
        mad_tag: pyt_vllm_llama-2-70b
        model_repo: meta-llama/Llama-2-70b-chat-hf
        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
        precision: float16
      - model: Llama 3.1 8B FP8
        mad_tag: pyt_vllm_llama-3.1-8b_fp8
        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
        precision: float8
      - model: Llama 3.1 70B FP8
        mad_tag: pyt_vllm_llama-3.1-70b_fp8
        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
        precision: float8
      - model: Llama 3.1 405B FP8
        mad_tag: pyt_vllm_llama-3.1-405b_fp8
        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
        precision: float8
    - group: Mistral
      tag: mistral
      models:
      - model: Mixtral MoE 8x7B
        mad_tag: pyt_vllm_mixtral-8x7b
        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
        precision: float16
      - model: Mixtral MoE 8x22B
        mad_tag: pyt_vllm_mixtral-8x22b
        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
        precision: float16
      - model: Mistral 7B
        mad_tag: pyt_vllm_mistral-7b
        model_repo: mistralai/Mistral-7B-Instruct-v0.3
        url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
        precision: float16
      - model: Mixtral MoE 8x7B FP8
        mad_tag: pyt_vllm_mixtral-8x7b_fp8
        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
        precision: float8
      - model: Mixtral MoE 8x22B FP8
        mad_tag: pyt_vllm_mixtral-8x22b_fp8
        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
        precision: float8
      - model: Mistral 7B FP8
        mad_tag: pyt_vllm_mistral-7b_fp8
        model_repo: amd/Mistral-7B-v0.1-FP8-KV
        url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
        precision: float8
    - group: Qwen
      tag: qwen
      models:
      - model: Qwen2 7B
        mad_tag: pyt_vllm_qwen2-7b
        model_repo: Qwen/Qwen2-7B-Instruct
        url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
        precision: float16
      - model: Qwen2 72B
        mad_tag: pyt_vllm_qwen2-72b
        model_repo: Qwen/Qwen2-72B-Instruct
        url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
        precision: float16
      - model: QwQ-32B
        mad_tag: pyt_vllm_qwq-32b
        model_repo: Qwen/QwQ-32B
        url: https://huggingface.co/Qwen/QwQ-32B
        precision: float16
        tunableop: true
    - group: DBRX
      tag: dbrx
      models:
      - model: DBRX Instruct
        mad_tag: pyt_vllm_dbrx-instruct
        model_repo: databricks/dbrx-instruct
        url: https://huggingface.co/databricks/dbrx-instruct
        precision: float16
      - model: DBRX Instruct FP8
        mad_tag: pyt_vllm_dbrx_fp8
        model_repo: amd/dbrx-instruct-FP8-KV
        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
        precision: float8
    - group: Gemma
      tag: gemma
      models:
      - model: Gemma 2 27B
        mad_tag: pyt_vllm_gemma-2-27b
        model_repo: google/gemma-2-27b
        url: https://huggingface.co/google/gemma-2-27b
        precision: float16
    - group: Cohere
      tag: cohere
      models:
      - model: C4AI Command R+ 08-2024
        mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
        model_repo: CohereForAI/c4ai-command-r-plus-08-2024
        url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
        precision: float16
      - model: C4AI Command R+ 08-2024 FP8
        mad_tag: pyt_vllm_command-r-plus_fp8
        model_repo: amd/c4ai-command-r-plus-FP8-KV
        url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
        precision: float8
    - group: DeepSeek
      tag: deepseek
      models:
      - model: DeepSeek MoE 16B
        mad_tag: pyt_vllm_deepseek-moe-16b-chat
        model_repo: deepseek-ai/deepseek-moe-16b-chat
        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
        precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250513-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250513-benchmark-models.yaml
@@ -0,0 +1,152 @@
 vllm_benchmark:
  unified_docker:
    latest:
      pull_tag: rocm/vllm:rocm6.3.1_vllm_0.8.5_20250513
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4
      rocm_version: 6.3.1
      vllm_version: 0.8.5
      pytorch_version: 2.7.0+gitf717b2a
      hipblaslt_version: 0.15
  model_groups:
    - group: Meta Llama
      tag: llama
      models:
      - model: Llama 3.1 8B
        mad_tag: pyt_vllm_llama-3.1-8b
        model_repo: meta-llama/Llama-3.1-8B-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
        precision: float16
      - model: Llama 3.1 70B
        mad_tag: pyt_vllm_llama-3.1-70b
        model_repo: meta-llama/Llama-3.1-70B-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
        precision: float16
      - model: Llama 3.1 405B
        mad_tag: pyt_vllm_llama-3.1-405b
        model_repo: meta-llama/Llama-3.1-405B-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
        precision: float16
      - model: Llama 3.2 11B Vision
        mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
        model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
        precision: float16
      - model: Llama 2 7B
        mad_tag: pyt_vllm_llama-2-7b
        model_repo: meta-llama/Llama-2-7b-chat-hf
        url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
        precision: float16
      - model: Llama 2 70B
        mad_tag: pyt_vllm_llama-2-70b
        model_repo: meta-llama/Llama-2-70b-chat-hf
        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
        precision: float16
      - model: Llama 3.1 8B FP8
        mad_tag: pyt_vllm_llama-3.1-8b_fp8
        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
        precision: float8
      - model: Llama 3.1 70B FP8
        mad_tag: pyt_vllm_llama-3.1-70b_fp8
        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
        precision: float8
      - model: Llama 3.1 405B FP8
        mad_tag: pyt_vllm_llama-3.1-405b_fp8
        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
        precision: float8
    - group: Mistral AI
      tag: mistral
      models:
      - model: Mixtral MoE 8x7B
        mad_tag: pyt_vllm_mixtral-8x7b
        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
        precision: float16
      - model: Mixtral MoE 8x22B
        mad_tag: pyt_vllm_mixtral-8x22b
        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
        precision: float16
      - model: Mistral 7B
        mad_tag: pyt_vllm_mistral-7b
        model_repo: mistralai/Mistral-7B-Instruct-v0.3
        url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
        precision: float16
      - model: Mixtral MoE 8x7B FP8
        mad_tag: pyt_vllm_mixtral-8x7b_fp8
        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
        precision: float8
      - model: Mixtral MoE 8x22B FP8
        mad_tag: pyt_vllm_mixtral-8x22b_fp8
        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
        precision: float8
      - model: Mistral 7B FP8
        mad_tag: pyt_vllm_mistral-7b_fp8
        model_repo: amd/Mistral-7B-v0.1-FP8-KV
        url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
        precision: float8
    - group: Qwen
      tag: qwen
      models:
      - model: Qwen2 7B
        mad_tag: pyt_vllm_qwen2-7b
        model_repo: Qwen/Qwen2-7B-Instruct
        url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
        precision: float16
      - model: Qwen2 72B
        mad_tag: pyt_vllm_qwen2-72b
        model_repo: Qwen/Qwen2-72B-Instruct
        url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
        precision: float16
      - model: QwQ-32B
        mad_tag: pyt_vllm_qwq-32b
        model_repo: Qwen/QwQ-32B
        url: https://huggingface.co/Qwen/QwQ-32B
        precision: float16
        tunableop: true
    - group: Databricks DBRX
      tag: dbrx
      models:
      - model: DBRX Instruct
        mad_tag: pyt_vllm_dbrx-instruct
        model_repo: databricks/dbrx-instruct
        url: https://huggingface.co/databricks/dbrx-instruct
        precision: float16
      - model: DBRX Instruct FP8
        mad_tag: pyt_vllm_dbrx_fp8
        model_repo: amd/dbrx-instruct-FP8-KV
        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
        precision: float8
    - group: Google Gemma
      tag: gemma
      models:
      - model: Gemma 2 27B
        mad_tag: pyt_vllm_gemma-2-27b
        model_repo: google/gemma-2-27b
        url: https://huggingface.co/google/gemma-2-27b
        precision: float16
    - group: Cohere
      tag: cohere
      models:
      - model: C4AI Command R+ 08-2024
        mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
        model_repo: CohereForAI/c4ai-command-r-plus-08-2024
        url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
        precision: float16
      - model: C4AI Command R+ 08-2024 FP8
        mad_tag: pyt_vllm_command-r-plus_fp8
        model_repo: amd/c4ai-command-r-plus-FP8-KV
        url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
        precision: float8
    - group: DeepSeek
      tag: deepseek
      models:
      - model: DeepSeek MoE 16B
        mad_tag: pyt_vllm_deepseek-moe-16b-chat
        model_repo: deepseek-ai/deepseek-moe-16b-chat
        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
        precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250521-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250521-benchmark-models.yaml
@@ -0,0 +1,167 @@
 vllm_benchmark:
  unified_docker:
    latest:
      pull_tag: rocm/vllm:rocm6.3.1_vllm_0.8.5_20250521
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11
      rocm_version: 6.3.1
      vllm_version: 0.8.5 (0.8.6.dev315+g91a560098.rocm631)
      pytorch_version: 2.7.0+gitf717b2a
      hipblaslt_version: 0.15
  model_groups:
    - group: Meta Llama
      tag: llama
      models:
        - model: Llama 3.1 8B
          mad_tag: pyt_vllm_llama-3.1-8b
          model_repo: meta-llama/Llama-3.1-8B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
          precision: float16
        - model: Llama 3.1 70B
          mad_tag: pyt_vllm_llama-3.1-70b
          model_repo: meta-llama/Llama-3.1-70B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
          precision: float16
        - model: Llama 3.1 405B
          mad_tag: pyt_vllm_llama-3.1-405b
          model_repo: meta-llama/Llama-3.1-405B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
          precision: float16
        - model: Llama 3.2 11B Vision
          mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
          model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
          precision: float16
        - model: Llama 2 7B
          mad_tag: pyt_vllm_llama-2-7b
          model_repo: meta-llama/Llama-2-7b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
          precision: float16
        - model: Llama 2 70B
          mad_tag: pyt_vllm_llama-2-70b
          model_repo: meta-llama/Llama-2-70b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
          precision: float16
        - model: Llama 3.1 8B FP8
          mad_tag: pyt_vllm_llama-3.1-8b_fp8
          model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 70B FP8
          mad_tag: pyt_vllm_llama-3.1-70b_fp8
          model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 405B FP8
          mad_tag: pyt_vllm_llama-3.1-405b_fp8
          model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
          precision: float8
    - group: Mistral AI
      tag: mistral
      models:
        - model: Mixtral MoE 8x7B
          mad_tag: pyt_vllm_mixtral-8x7b
          model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
          precision: float16
        - model: Mixtral MoE 8x22B
          mad_tag: pyt_vllm_mixtral-8x22b
          model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
          precision: float16
        - model: Mistral 7B
          mad_tag: pyt_vllm_mistral-7b
          model_repo: mistralai/Mistral-7B-Instruct-v0.3
          url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
          precision: float16
        - model: Mixtral MoE 8x7B FP8
          mad_tag: pyt_vllm_mixtral-8x7b_fp8
          model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mixtral MoE 8x22B FP8
          mad_tag: pyt_vllm_mixtral-8x22b_fp8
          model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mistral 7B FP8
          mad_tag: pyt_vllm_mistral-7b_fp8
          model_repo: amd/Mistral-7B-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
          precision: float8
    - group: Qwen
      tag: qwen
      models:
        - model: Qwen2 7B
          mad_tag: pyt_vllm_qwen2-7b
          model_repo: Qwen/Qwen2-7B-Instruct
          url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
          precision: float16
        - model: Qwen2 72B
          mad_tag: pyt_vllm_qwen2-72b
          model_repo: Qwen/Qwen2-72B-Instruct
          url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
          precision: float16
        - model: QwQ-32B
          mad_tag: pyt_vllm_qwq-32b
          model_repo: Qwen/QwQ-32B
          url: https://huggingface.co/Qwen/QwQ-32B
          precision: float16
          tunableop: true
    - group: Databricks DBRX
      tag: dbrx
      models:
        - model: DBRX Instruct
          mad_tag: pyt_vllm_dbrx-instruct
          model_repo: databricks/dbrx-instruct
          url: https://huggingface.co/databricks/dbrx-instruct
          precision: float16
        - model: DBRX Instruct FP8
          mad_tag: pyt_vllm_dbrx_fp8
          model_repo: amd/dbrx-instruct-FP8-KV
          url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
          precision: float8
    - group: Google Gemma
      tag: gemma
      models:
        - model: Gemma 2 27B
          mad_tag: pyt_vllm_gemma-2-27b
          model_repo: google/gemma-2-27b
          url: https://huggingface.co/google/gemma-2-27b
          precision: float16
    - group: Cohere
      tag: cohere
      models:
        - model: C4AI Command R+ 08-2024
          mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
          model_repo: CohereForAI/c4ai-command-r-plus-08-2024
          url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
          precision: float16
        - model: C4AI Command R+ 08-2024 FP8
          mad_tag: pyt_vllm_command-r-plus_fp8
          model_repo: amd/c4ai-command-r-plus-FP8-KV
          url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
          precision: float8
    - group: DeepSeek
      tag: deepseek
      models:
        - model: DeepSeek MoE 16B
          mad_tag: pyt_vllm_deepseek-moe-16b-chat
          model_repo: deepseek-ai/deepseek-moe-16b-chat
          url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
          precision: float16
    - group: Microsoft Phi
      tag: phi
      models:
        - model: Phi-4
          mad_tag: pyt_vllm_phi-4
          model_repo: microsoft/phi-4
          url: https://huggingface.co/microsoft/phi-4
    - group: TII Falcon
      tag: falcon
      models:
        - model: Falcon 180B
          mad_tag: pyt_vllm_falcon-180b
          model_repo: tiiuae/falcon-180B
          url: https://huggingface.co/tiiuae/falcon-180B
          precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml
@@ -23,3 +23,11 @@ pytorch_inference_benchmark:
        model_repo: meta-llama/Llama-3.1-8B-Instruct
        url: https://huggingface.co/chaidiscovery/chai-1
        precision: float16
    - group: Mochi Video
      tag: mochi
      models:
      - model: Mochi 1
        mad_tag: pyt_mochi_video_inference
        model_repo: genmo/mochi-1-preview
        url: https://huggingface.co/genmo/mochi-1-preview
        precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
@@ -1,14 +1,14 @@
 vllm_benchmark:
  unified_docker:
    latest:
-      pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c
-      rocm_version: 6.3.1
+      rocm_version: 6.4.1
-      vllm_version: 0.8.3
+      vllm_version: 0.9.0.1 (0.9.0.2.dev108+g71faa1880.rocm641)
-      pytorch_version: 2.7.0 (dev nightly)
+      pytorch_version: 2.7.0+gitf717b2a
-      hipblaslt_version: 0.13
+      hipblaslt_version: 0.15
  model_groups:
-    - group: Llama
+    - group: Meta Llama
      tag: llama
      models:
      - model: Llama 3.1 8B
@@ -26,11 +26,6 @@ vllm_benchmark:
        model_repo: meta-llama/Llama-3.1-405B-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
        precision: float16
      - model: Llama 3.2 11B Vision
        mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
        model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
        precision: float16
      - model: Llama 2 7B
        mad_tag: pyt_vllm_llama-2-7b
        model_repo: meta-llama/Llama-2-7b-chat-hf
@@ -56,7 +51,7 @@ vllm_benchmark:
        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
        precision: float8
-    - group: Mistral
+    - group: Mistral AI
      tag: mistral
      models:
      - model: Mixtral MoE 8x7B
@@ -108,7 +103,7 @@ vllm_benchmark:
        url: https://huggingface.co/Qwen/QwQ-32B
        precision: float16
        tunableop: true
-    - group: DBRX
+    - group: Databricks DBRX
      tag: dbrx
      models:
      - model: DBRX Instruct
@@ -121,7 +116,7 @@ vllm_benchmark:
        model_repo: amd/dbrx-instruct-FP8-KV
        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
        precision: float8
-    - group: Gemma
+    - group: Google Gemma
      tag: gemma
      models:
      - model: Gemma 2 27B
@@ -150,3 +145,18 @@ vllm_benchmark:
        model_repo: deepseek-ai/deepseek-moe-16b-chat
        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
        precision: float16
    - group: Microsoft Phi
      tag: phi
      models:
      - model: Phi-4
        mad_tag: pyt_vllm_phi-4
        model_repo: microsoft/phi-4
        url: https://huggingface.co/microsoft/phi-4
    - group: TII Falcon
      tag: falcon
      models:
      - model: Falcon 180B
        mad_tag: pyt_vllm_falcon-180b
        model_repo: tiiuae/falcon-180B
        url: https://huggingface.co/tiiuae/falcon-180B
        precision: float16
--- a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
@@ -0,0 +1,29 @@
 megatron-lm_benchmark:
  model_groups:
    - group: Meta Llama
      tag: llama
      models:
      - model: Llama 3.3 70B
        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
      - model: Llama 3.1 8B
        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
      - model: Llama 3.1 70B
        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
      - model: Llama 2 7B
        mad_tag: pyt_megatron_lm_train_llama-2-7b
      - model: Llama 2 70B
        mad_tag: pyt_megatron_lm_train_llama-2-70b
    - group: DeepSeek
      tag: deepseek
      models:
      - model: DeepSeek-V3
        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
      - model: DeepSeek-V2-Lite
        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
    - group: Mistral AI
      tag: mistral
      models:
      - model: Mixtral 8x7B
        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
      - model: Mixtral 8x22B
        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
--- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
@@ -0,0 +1,120 @@
 unified_docker:
  latest:
    pull_tag: rocm/pytorch-training:v25.6
    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
    rocm_version: 6.4.1
    pytorch_version: 2.8.0a0+git7d205b2
    python_version: 3.10.17
    transformer_engine_version: 1.14.0+2f85f5f2
    flash_attention_version: 3.0.0.post1
    hipblaslt_version: 0.15.0-8c6919d
    triton_version: 3.3.0
 model_groups:
  - group: Pre-training
    tag: pre-training
    models:
    - model: Llama 3.1 8B
      mad_tag: pyt_train_llama-3.1-8b
      model_repo: Llama-3.1-8B
      url: https://huggingface.co/meta-llama/Llama-3.1-8B
      precision: BF16
      training_modes: [pretrain]
    - model: Llama 3.1 70B
      mad_tag: pyt_train_llama-3.1-70b
      model_repo: Llama-3.1-70B
      url: https://huggingface.co/meta-llama/Llama-3.1-70B
      precision: BF16
      training_modes: [pretrain]
    - model: FLUX.1-dev
      mad_tag: pyt_train_flux
      model_repo: Flux
      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
      precision: BF16
      training_modes: [pretrain]
  - group: Fine-tuning
    tag: fine-tuning
    models:
    - model: Llama 4 Scout 17B-16E
      mad_tag: pyt_train_llama-4-scout-17b-16e
      model_repo: Llama-4-17B_16E
      url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
      precision: BF16
      training_modes: [finetune_fw, finetune_lora]
    - model: Llama 3.3 70B
      mad_tag: pyt_train_llama-3.3-70b
      model_repo: Llama-3.3-70B
      url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
      precision: BF16
      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
    - model: Llama 3.2 1B
      mad_tag: pyt_train_llama-3.2-1b
      model_repo: Llama-3.2-1B
      url: https://huggingface.co/meta-llama/Llama-3.2-1B
      precision: BF16
      training_modes: [finetune_fw, finetune_lora]
    - model: Llama 3.2 3B
      mad_tag: pyt_train_llama-3.2-3b
      model_repo: Llama-3.2-3B
      url: https://huggingface.co/meta-llama/Llama-3.2-3B
      precision: BF16
      training_modes: [finetune_fw, finetune_lora]
    - model: Llama 3.2 Vision 11B
      mad_tag: pyt_train_llama-3.2-vision-11b
      model_repo: Llama-3.2-Vision-11B
      url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
      precision: BF16
      training_modes: [finetune_fw]
    - model: Llama 3.2 Vision 90B
      mad_tag: pyt_train_llama-3.2-vision-90b
      model_repo: Llama-3.2-Vision-90B
      url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
      precision: BF16
      training_modes: [finetune_fw]
    - model: Llama 3.1 8B
      mad_tag: pyt_train_llama-3.1-8b
      model_repo: Llama-3.1-8B
      url: https://huggingface.co/meta-llama/Llama-3.1-8B
      precision: BF16
      training_modes: [finetune_fw, finetune_lora]
    - model: Llama 3.1 70B
      mad_tag: pyt_train_llama-3.1-70b
      model_repo: Llama-3.1-70B
      url: https://huggingface.co/meta-llama/Llama-3.1-70B
      precision: BF16
      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
    - model: Llama 3.1 405B
      mad_tag: pyt_train_llama-3.1-405b
      model_repo: Llama-3.1-405B
      url: https://huggingface.co/meta-llama/Llama-3.1-405B
      precision: BF16
      training_modes: [finetune_qlora, HF_finetune_lora]
    - model: Llama 3 8B
      mad_tag: pyt_train_llama-3-8b
      model_repo: Llama-3-8B
      url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
      precision: BF16
      training_modes: [finetune_fw, finetune_lora]
    - model: Llama 3 70B
      mad_tag: pyt_train_llama-3-70b
      model_repo: Llama-3-70B
      url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
      precision: BF16
      training_modes: [finetune_fw, finetune_lora]
    - model: Llama 2 7B
      mad_tag: pyt_train_llama-2-7b
      model_repo: Llama-2-7B
      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
      precision: BF16
      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
    - model: Llama 2 13B
      mad_tag: pyt_train_llama-2-13b
      model_repo: Llama-2-13B
      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
      precision: BF16
      training_modes: [finetune_fw, finetune_lora]
    - model: Llama 2 70B
      mad_tag: pyt_train_llama-2-70b
      model_repo: Llama-2-70B
      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
      precision: BF16
      training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
--- a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
@@ -678,7 +678,7 @@ To specify the quantization scaling config, use the
 ``--quantization-param-path`` parameter. If the parameter is not specified,
 the default scaling factor of ``1`` is used, which can lead to less accurate
 results. To generate ``kv-cache`` scaling JSON file, see `FP8 KV
-Cache <https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md>`__
+Cache <https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_kv_cache/README.md>`__
 in the LLM Compressor GitHub repository.
 Two sample Llama scaling configuration files are in vLLM for ``llama2-70b`` and
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.4.3.rst
@@ -0,0 +1,346 @@
 :orphan:
 .. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
                 ROCm Docker image.
   :keywords: model, MAD, automation, dashboarding, validate
 **********************************
 vLLM inference performance testing
 **********************************
 .. caution::
   This documentation does not reflect the latest version of ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.
 .. _vllm-benchmark-unified-docker:
 The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
 a prebuilt, optimized environment designed for validating large language model
 (LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
 ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
 MI300X accelerator and includes the following components:
 * `ROCm 6.2.0 <https://github.com/ROCm/ROCm>`_
 * `vLLM 0.4.3 <https://docs.vllm.ai/en/latest>`_
 * `PyTorch 2.4.0 <https://github.com/pytorch/pytorch>`_
 * Tuning files (in CSV format)
 With this Docker image, you can quickly validate the expected inference
 performance numbers on the MI300X accelerator. This topic also provides tips on
 optimizing performance with popular AI models.
 .. _vllm-benchmark-vllm:
 .. note::
   vLLM is a toolkit and library for LLM inference and
   serving. It deploys the PagedAttention algorithm, which reduces memory
   consumption and increases throughput by leveraging dynamic key and value
   allocation in GPU memory. vLLM also incorporates many LLM acceleration
   and quantization algorithms. In addition, AMD implements high-performance
   custom kernels and modules in vLLM to enhance performance further. See
   :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for more
   information.
 Getting started
 ===============
 Use the following procedures to reproduce the benchmark results on an
 MI300X accelerator with the prebuilt vLLM Docker image.
 .. _vllm-benchmark-get-started:
 1. Disable NUMA auto-balancing.
   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
   might hang until the periodic balancing is finalized. For more information,
   see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
   .. code-block:: shell
      # disable automatic NUMA balancing
      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
      # check if NUMA balancing is disabled (returns 0 if disabled)
      cat /proc/sys/kernel/numa_balancing
      0
 2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
   Use the following command to pull the Docker image from Docker Hub.
   .. code-block:: shell
      docker pull rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
 Once setup is complete, you can choose between two options to reproduce the
 benchmark results:
 -  :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
 -  :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
 .. _vllm-benchmark-mad:
 MAD-integrated benchmarking
 ===========================
 Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
 directory and install the required packages on the host machine.
 .. code-block:: shell
   git clone https://github.com/ROCm/MAD
   cd MAD
   pip install -r requirements.txt
 Use this command to run a performance benchmark test of the Llama 3.1 8B model
 on one GPU with the ``float16`` data type on the host machine.
 .. code-block:: shell
   export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
   python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
 ROCm MAD launches a Docker container with the name
 ``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
 model are collected in the following path: ``~/MAD/reports_float16/``.
 Although the following eight models are pre-configured to collect latency and
 throughput performance data, users can also change the benchmarking parameters.
 Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
 Available models
 ----------------
 .. hlist::
   :columns: 3
   * ``pyt_vllm_llama-3.1-8b``
   * ``pyt_vllm_llama-3.1-70b``
   * ``pyt_vllm_llama-3.1-405b``
   * ``pyt_vllm_llama-2-7b``
   * ``pyt_vllm_mistral-7b``
   * ``pyt_vllm_qwen2-7b``
   * ``pyt_vllm_jais-13b``
   * ``pyt_vllm_jais-30b``
 .. _vllm-benchmark-standalone:
 Standalone benchmarking
 =======================
 You can run the vLLM benchmark tool independently by starting the
 :ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
 snippet.
 .. code-block::
   docker pull rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name unified_docker_vllm rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
 In the Docker container, clone the ROCm MAD repository and navigate to the
 benchmark scripts directory at ``~/MAD/scripts/vllm``.
 .. code-block::
   git clone https://github.com/ROCm/MAD
   cd MAD/scripts/vllm
 Multiprocessing distributed executor
 --------------------------------------
 To optimize vLLM performance, add the multiprocessing API server argument ``--distributed-executor-backend mp``.
 Command
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 To start the benchmark, use the following command with the appropriate options.
 See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
 options and their descriptions.
 .. code-block:: shell
   ./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
 See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
 .. note::
   The input sequence length, output sequence length, and tensor parallel (TP) are
   already configured. You don't need to specify them with this script.
 .. note::
   If you encounter the following error, pass your access-authorized Hugging
   Face token to the gated models.
   .. code-block:: shell
      OSError: You are trying to access a gated repo.
      # pass your HF_TOKEN
      export HF_TOKEN=$your_personal_hf_token
 .. _vllm-benchmark-standalone-options:
 Options
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 .. list-table::
   :header-rows: 1
   * - Name
     - Options
     - Description
   * - ``$test_option``
     - latency
     - Measure decoding token latency
   * -
     - throughput
     - Measure token generation throughput
   * -
     - all
     - Measure both throughput and latency
   * - ``$model_repo``
     - ``meta-llama/Meta-Llama-3.1-8B-Instruct``
     - Llama 3.1 8B
   * - (``float16``)
     - ``meta-llama/Meta-Llama-3.1-70B-Instruct``
     - Llama 3.1 70B
   * -
     - ``meta-llama/Meta-Llama-3.1-405B-Instruct``
     - Llama 3.1 405B
   * -
     - ``meta-llama/Llama-2-7b-chat-hf``
     - Llama 2 7B
   * -
     - ``mistralai/Mixtral-8x7B-Instruct-v0.1``
     - Mixtral 8x7B
   * -
     - ``mistralai/Mixtral-8x22B-Instruct-v0.1``
     - Mixtral 8x22B
   * -
     - ``mistralai/Mistral-7B-Instruct-v0.3``
     - Mistral 7B
   * -
     - ``Qwen/Qwen2-7B-Instruct``
     - Qwen2 7B
   * -
     - ``core42/jais-13b-chat``
     - JAIS 13B
   * -
     - ``core42/jais-30b-chat-v3``
     - JAIS 30B
   * - ``$num_gpu``
     - 1 or 8
     - Number of GPUs
   * - ``$datatype``
     - ``float16``
     - Data type
 .. _vllm-benchmark-run-benchmark:
 Running the benchmark on the MI300X accelerator
 -----------------------------------------------
 Here are some examples of running the benchmark with various options.
 See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
 options and their descriptions.
 Latency benchmark example
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 Use this command to benchmark the latency of the Llama 3.1 8B model on one GPU with the ``float16`` data type.
 .. code-block::
   ./vllm_benchmark_report.sh -s latency -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
 Find the latency report at:
 - ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv``
 Throughput benchmark example
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 Use this command to benchmark the throughput of the Llama 3.1 8B model on one GPU with the ``float16`` data type.
 .. code-block:: shell
   ./vllm_benchmark_report.sh -s throughput -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
 Find the throughput report at:
 - ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_throughput_report.csv``
 .. raw:: html
   <style>
   mjx-container[jax="CHTML"][display="true"] {
       text-align: left;
       margin: 0;
   }
   </style>
 .. note::
   Throughput is calculated as:
   - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
   - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
 Further reading
 ===============
 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/tuning-guides/mi300x/workload`.
 - To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
 - To learn more about system settings and management practices to configure your system for
  MI300X accelerators, see :doc:`/how-to/system-optimization/mi300x`.
 - To learn how to run LLM models from Hugging Face or your own model, see
  :doc:`Using ROCm for AI </how-to/rocm-for-ai/index>`.
 - To learn how to optimize inference on LLMs, see
  :doc:`Fine-tuning LLMs and inference optimization </how-to/llm-fine-tuning-optimization/index>`.
 - For a list of other ready-made Docker images for ROCm, see the
  :doc:`Docker image support matrix <rocm-install-on-linux:reference/docker-image-support-matrix>`.
 Previous versions
 =================
 See :doc:`vllm-history` to find documentation for previous releases
 of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.4.rst
@@ -0,0 +1,419 @@
 :orphan:
 .. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
                 ROCm Docker image.
   :keywords: model, MAD, automation, dashboarding, validate
 **********************************
 vLLM inference performance testing
 **********************************
 .. caution::
   This documentation does not reflect the latest version of ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.
 .. _vllm-benchmark-unified-docker:
 The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
 a prebuilt, optimized environment designed for validating large language model
 (LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
 ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
 MI300X accelerator and includes the following components:
 * `ROCm 6.2.1 <https://github.com/ROCm/ROCm>`_
 * `vLLM 0.6.4 <https://docs.vllm.ai/en/latest>`_
 * `PyTorch 2.5.0 <https://github.com/pytorch/pytorch>`_
 * Tuning files (in CSV format)
 With this Docker image, you can quickly validate the expected inference
 performance numbers on the MI300X accelerator. This topic also provides tips on
 optimizing performance with popular AI models.
 .. hlist::
   :columns: 6
   * Llama 3.1 8B
   * Llama 3.1 70B
   * Llama 3.1 405B
   * Llama 2 7B
   * Llama 2 70B
   * Mixtral 8x7B
   * Mixtral 8x22B
   * Mistral 7B
   * Qwen2 7B
   * Qwen2 72B
   * JAIS 13B
   * JAIS 30B
 .. _vllm-benchmark-vllm:
 .. note::
   vLLM is a toolkit and library for LLM inference and serving. AMD implements
   high-performance custom kernels and modules in vLLM to enhance performance.
   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
   more information.
 Getting started
 ===============
 Use the following procedures to reproduce the benchmark results on an
 MI300X accelerator with the prebuilt vLLM Docker image.
 .. _vllm-benchmark-get-started:
 1. Disable NUMA auto-balancing.
   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
   might hang until the periodic balancing is finalized. For more information,
   see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
   .. code-block:: shell
      # disable automatic NUMA balancing
      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
      # check if NUMA balancing is disabled (returns 0 if disabled)
      cat /proc/sys/kernel/numa_balancing
      0
 2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
   Use the following command to pull the Docker image from Docker Hub.
   .. code-block:: shell
      docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
 Once setup is complete, you can choose between two options to reproduce the
 benchmark results:
 -  :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
 -  :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
 .. _vllm-benchmark-mad:
 MAD-integrated benchmarking
 ===========================
 Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
 directory and install the required packages on the host machine.
 .. code-block:: shell
   git clone https://github.com/ROCm/MAD
   cd MAD
   pip install -r requirements.txt
 Use this command to run a performance benchmark test of the Llama 3.1 8B model
 on one GPU with the ``float16`` data type on the host machine.
 .. code-block:: shell
   export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
   python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
 ROCm MAD launches a Docker container with the name
 ``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
 model are collected in the following path: ``~/MAD/reports_float16/``.
 Although the following models are preconfigured to collect latency and
 throughput performance data, you can also change the benchmarking parameters.
 Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
 Available models
 ----------------
 .. hlist::
   :columns: 3
   * ``pyt_vllm_llama-3.1-8b``
   * ``pyt_vllm_llama-3.1-70b``
   * ``pyt_vllm_llama-3.1-405b``
   * ``pyt_vllm_llama-2-7b``
   * ``pyt_vllm_llama-2-70b``
   * ``pyt_vllm_mixtral-8x7b``
   * ``pyt_vllm_mixtral-8x22b``
   * ``pyt_vllm_mistral-7b``
   * ``pyt_vllm_qwen2-7b``
   * ``pyt_vllm_qwen2-72b``
   * ``pyt_vllm_jais-13b``
   * ``pyt_vllm_jais-30b``
   * ``pyt_vllm_llama-3.1-8b_fp8``
   * ``pyt_vllm_llama-3.1-70b_fp8``
   * ``pyt_vllm_llama-3.1-405b_fp8``
   * ``pyt_vllm_mixtral-8x7b_fp8``
   * ``pyt_vllm_mixtral-8x22b_fp8``
 .. _vllm-benchmark-standalone:
 Standalone benchmarking
 =======================
 You can run the vLLM benchmark tool independently by starting the
 :ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
 snippet.
 .. code-block::
   docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.4 rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
 In the Docker container, clone the ROCm MAD repository and navigate to the
 benchmark scripts directory at ``~/MAD/scripts/vllm``.
 .. code-block::
   git clone https://github.com/ROCm/MAD
   cd MAD/scripts/vllm
 Command
 -------
 To start the benchmark, use the following command with the appropriate options.
 See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
 options and their descriptions.
 .. code-block:: shell
   ./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
 See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
 .. note::
   The input sequence length, output sequence length, and tensor parallel (TP) are
   already configured. You don't need to specify them with this script.
 .. note::
   If you encounter the following error, pass your access-authorized Hugging
   Face token to the gated models.
   .. code-block:: shell
      OSError: You are trying to access a gated repo.
      # pass your HF_TOKEN
      export HF_TOKEN=$your_personal_hf_token
 .. _vllm-benchmark-standalone-options:
 Options
 -------
 .. list-table::
   :header-rows: 1
   :align: center
   * - Name
     - Options
     - Description
   * - ``$test_option``
     - latency
     - Measure decoding token latency
   * -
     - throughput
     - Measure token generation throughput
   * -
     - all
     - Measure both throughput and latency
   * - ``$model_repo``
     - ``meta-llama/Meta-Llama-3.1-8B-Instruct``
     - Llama 3.1 8B
   * - (``float16``)
     - ``meta-llama/Meta-Llama-3.1-70B-Instruct``
     - Llama 3.1 70B
   * -
     - ``meta-llama/Meta-Llama-3.1-405B-Instruct``
     - Llama 3.1 405B
   * -
     - ``meta-llama/Llama-2-7b-chat-hf``
     - Llama 2 7B
   * -
     - ``meta-llama/Llama-2-70b-chat-hf``
     - Llama 2 70B
   * -
     - ``mistralai/Mixtral-8x7B-Instruct-v0.1``
     - Mixtral 8x7B
   * -
     - ``mistralai/Mixtral-8x22B-Instruct-v0.1``
     - Mixtral 8x22B
   * -
     - ``mistralai/Mistral-7B-Instruct-v0.3``
     - Mistral 7B
   * -
     - ``Qwen/Qwen2-7B-Instruct``
     - Qwen2 7B
   * -
     - ``Qwen/Qwen2-72B-Instruct``
     - Qwen2 72B
   * -
     - ``core42/jais-13b-chat``
     - JAIS 13B
   * -
     - ``core42/jais-30b-chat-v3``
     - JAIS 30B
   * - ``$model_repo``
     - ``amd/Meta-Llama-3.1-8B-Instruct-FP8-KV``
     - Llama 3.1 8B
   * - (``float8``)
     - ``amd/Meta-Llama-3.1-70B-Instruct-FP8-KV``
     - Llama 3.1 70B
   * -
     - ``amd/Meta-Llama-3.1-405B-Instruct-FP8-KV``
     - Llama 3.1 405B
   * -
     - ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
     - Mixtral 8x7B
   * -
     - ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
     - Mixtral 8x22B
   * - ``$num_gpu``
     - 1 or 8
     - Number of GPUs
   * - ``$datatype``
     - ``float16`` or ``float8``
     - Data type
 .. _vllm-benchmark-run-benchmark:
 Running the benchmark on the MI300X accelerator
 -----------------------------------------------
 Here are some examples of running the benchmark with various options.
 See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
 options and their descriptions.
 Example 1: latency benchmark
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 Use these commands to benchmark the latency of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.
 .. code-block::
   ./vllm_benchmark_report.sh -s latency -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
   ./vllm_benchmark_report.sh -s latency -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8
 Find the latency reports at:
 - ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv``
 - ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_latency_report.csv``
 Example 2: throughput benchmark
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 Use these commands to benchmark the throughput of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.
 .. code-block:: shell
   ./vllm_benchmark_report.sh -s throughput -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
   ./vllm_benchmark_report.sh -s throughput -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8
 Find the throughput reports at:
 - ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_throughput_report.csv``
 - ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_throughput_report.csv``
 .. raw:: html
   <style>
   mjx-container[jax="CHTML"][display="true"] {
       text-align: left;
       margin: 0;
   }
   </style>
 .. note::
   Throughput is calculated as:
   - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
   - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
 Further reading
 ===============
 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/tuning-guides/mi300x/workload`.
 - To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
 - To learn more about system settings and management practices to configure your system for
  MI300X accelerators, see :doc:`/how-to/system-optimization/mi300x`.
 - To learn how to run LLM models from Hugging Face or your own model, see
  :doc:`Using ROCm for AI </how-to/rocm-for-ai/index>`.
 - To learn how to optimize inference on LLMs, see
  :doc:`Fine-tuning LLMs and inference optimization </how-to/llm-fine-tuning-optimization/index>`.
 - For a list of other ready-made Docker images for ROCm, see the
  :doc:`Docker image support matrix <rocm-install-on-linux:reference/docker-image-support-matrix>`.
 - To compare with the previous version of the ROCm vLLM Docker image for performance validation, refer to
  `LLM inference performance validation on AMD Instinct MI300X (ROCm 6.2.0) <https://rocm.docs.amd.com/en/docs-6.2.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_.
 Previous versions
 =================
 See :doc:`vllm-history` to find documentation for previous releases
 of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.6.6.rst
@@ -0,0 +1,461 @@
 :orphan:
 .. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate
 ***********************************************************
 LLM inference performance validation on AMD Instinct MI300X
 ***********************************************************
 .. caution::
   This documentation does not reflect the latest version of ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.
 .. _vllm-benchmark-unified-docker:
 The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
 a prebuilt, optimized environment for validating large language model (LLM)
 inference performance on the AMD Instinct™ MI300X accelerator. This ROCm vLLM
 Docker image integrates vLLM and PyTorch tailored specifically for the MI300X
 accelerator and includes the following components:
 * `ROCm 6.3.1 <https://github.com/ROCm/ROCm>`_
 * `vLLM 0.6.6 <https://docs.vllm.ai/en/latest>`_
 * `PyTorch 2.7.0 (2.7.0a0+git3a58512) <https://github.com/pytorch/pytorch>`_
 With this Docker image, you can quickly validate the expected inference
 performance numbers for the MI300X accelerator. This topic also provides tips on
 optimizing performance with popular AI models. For more information, see the lists of
 :ref:`available models for MAD-integrated benchmarking <vllm-benchmark-mad-models>`
 and :ref:`standalone benchmarking <vllm-benchmark-standalone-options>`.
 .. _vllm-benchmark-vllm:
 .. note::
   vLLM is a toolkit and library for LLM inference and serving. AMD implements
   high-performance custom kernels and modules in vLLM to enhance performance.
   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
   more information.
 Getting started
 ===============
 Use the following procedures to reproduce the benchmark results on an
 MI300X accelerator with the prebuilt vLLM Docker image.
 .. _vllm-benchmark-get-started:
 1. Disable NUMA auto-balancing.
   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
   might hang until the periodic balancing is finalized. For more information,
   see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
   .. code-block:: shell
      # disable automatic NUMA balancing
      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
      # check if NUMA balancing is disabled (returns 0 if disabled)
      cat /proc/sys/kernel/numa_balancing
      0
 2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
   Use the following command to pull the Docker image from Docker Hub.
   .. code-block:: shell
      docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
 Once the setup is complete, choose between two options to reproduce the
 benchmark results:
 -  :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
 -  :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
 .. _vllm-benchmark-mad:
 MAD-integrated benchmarking
 ===========================
 Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
 directory and install the required packages on the host machine.
 .. code-block:: shell
   git clone https://github.com/ROCm/MAD
   cd MAD
   pip install -r requirements.txt
 Use this command to run a performance benchmark test of the Llama 3.1 8B model
 on one GPU with the ``float16`` data type on the host machine.
 .. code-block:: shell
   export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
   python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
 ROCm MAD launches a Docker container with the name
 ``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
 model are collected in the following path: ``~/MAD/reports_float16/``.
 Although the following models are preconfigured to collect latency and
 throughput performance data, you can also change the benchmarking parameters.
 Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
 .. _vllm-benchmark-mad-models:
 Available models
 ----------------
 .. list-table::
   :header-rows: 1
   :widths: 2, 3
   * - Model name
     - Tag
   * - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`_
     - ``pyt_vllm_llama-3.1-8b``
   * - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
     - ``pyt_vllm_llama-3.1-70b``
   * - `Llama 3.1 405B <https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct>`_
     - ``pyt_vllm_llama-3.1-405b``
   * - `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_
     - ``pyt_vllm_llama-3.2-11b-vision-instruct``
   * - `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`_
     - ``pyt_vllm_llama-2-7b``
   * - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`_
     - ``pyt_vllm_llama-2-70b``
   * - `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_
     - ``pyt_vllm_mixtral-8x7b``
   * - `Mixtral MoE 8x22B <https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1>`_
     - ``pyt_vllm_mixtral-8x22b``
   * - `Mistral 7B <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`_
     - ``pyt_vllm_mistral-7b``
   * - `Qwen2 7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_
     - ``pyt_vllm_qwen2-7b``
   * - `Qwen2 72B <https://huggingface.co/Qwen/Qwen2-72B-Instruct>`_
     - ``pyt_vllm_qwen2-72b``
   * - `JAIS 13B <https://huggingface.co/core42/jais-13b-chat>`_
     - ``pyt_vllm_jais-13b``
   * - `JAIS 30B <https://huggingface.co/core42/jais-30b-chat-v3>`_
     - ``pyt_vllm_jais-30b``
   * - `DBRX Instruct <https://huggingface.co/databricks/dbrx-instruct>`_
     - ``pyt_vllm_dbrx-instruct``
   * - `Gemma 2 27B <https://huggingface.co/google/gemma-2-27b>`_
     - ``pyt_vllm_gemma-2-27b``
   * - `C4AI Command R+ 08-2024 <https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024>`_
     - ``pyt_vllm_c4ai-command-r-plus-08-2024``
   * - `DeepSeek MoE 16B <https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat>`_
     - ``pyt_vllm_deepseek-moe-16b-chat``
   * - `Llama 3.1 70B FP8 <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`_
     - ``pyt_vllm_llama-3.1-70b_fp8``
   * - `Llama 3.1 405B FP8 <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`_
     - ``pyt_vllm_llama-3.1-405b_fp8``
   * - `Mixtral MoE 8x7B FP8 <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`_
     - ``pyt_vllm_mixtral-8x7b_fp8``
   * - `Mixtral MoE 8x22B FP8 <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`_
     - ``pyt_vllm_mixtral-8x22b_fp8``
   * - `Mistral 7B FP8 <https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV>`_
     - ``pyt_vllm_mistral-7b_fp8``
   * - `DBRX Instruct FP8 <https://huggingface.co/amd/dbrx-instruct-FP8-KV>`_
     - ``pyt_vllm_dbrx_fp8``
   * - `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_
     - ``pyt_vllm_command-r-plus_fp8``
 .. _vllm-benchmark-standalone:
 Standalone benchmarking
 =======================
 You can run the vLLM benchmark tool independently by starting the
 :ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
 snippet.
 .. code-block::
   docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.6 rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
 In the Docker container, clone the ROCm MAD repository and navigate to the
 benchmark scripts directory at ``~/MAD/scripts/vllm``.
 .. code-block::
   git clone https://github.com/ROCm/MAD
   cd MAD/scripts/vllm
 Command
 -------
 To start the benchmark, use the following command with the appropriate options.
 See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
 options and their descriptions.
 .. code-block:: shell
   ./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
 See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
 .. note::
   The input sequence length, output sequence length, and tensor parallel (TP) are
   already configured. You don't need to specify them with this script.
 .. note::
   If you encounter the following error, pass your access-authorized Hugging
   Face token to the gated models.
   .. code-block:: shell
      OSError: You are trying to access a gated repo.
      # pass your HF_TOKEN
      export HF_TOKEN=$your_personal_hf_token
 .. _vllm-benchmark-standalone-options:
 Options and available models
 ----------------------------
 .. list-table::
   :header-rows: 1
   :align: center
   * - Name
     - Options
     - Description
   * - ``$test_option``
     - latency
     - Measure decoding token latency
   * -
     - throughput
     - Measure token generation throughput
   * -
     - all
     - Measure both throughput and latency
   * - ``$model_repo``
     - ``meta-llama/Llama-3.1-8B-Instruct``
     - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`_
   * - (``float16``)
     - ``meta-llama/Llama-3.1-70B-Instruct``
     - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
   * -
     - ``meta-llama/Llama-3.1-405B-Instruct``
     - `Llama 3.1 405B <https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct>`_
   * -
     - ``meta-llama/Llama-3.2-11B-Vision-Instruct``
     - `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_
   * -
     - ``meta-llama/Llama-2-7b-chat-hf``
     - `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`_
   * -
     - ``meta-llama/Llama-2-70b-chat-hf``
     - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`_
   * -
     - ``mistralai/Mixtral-8x7B-Instruct-v0.1``
     - `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_
   * -
     - ``mistralai/Mixtral-8x22B-Instruct-v0.1``
     - `Mixtral MoE 8x22B <https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1>`_
   * -
     - ``mistralai/Mistral-7B-Instruct-v0.3``
     - `Mistral 7B <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`_
   * -
     - ``Qwen/Qwen2-7B-Instruct``
     - `Qwen2 7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_
   * -
     - ``Qwen/Qwen2-72B-Instruct``
     - `Qwen2 72B <https://huggingface.co/Qwen/Qwen2-72B-Instruct>`_
   * -
     - ``core42/jais-13b-chat``
     - `JAIS 13B <https://huggingface.co/core42/jais-13b-chat>`_
   * -
     - ``core42/jais-30b-chat-v3``
     - `JAIS 30B <https://huggingface.co/core42/jais-30b-chat-v3>`_
   * -
     - ``databricks/dbrx-instruct``
     - `DBRX Instruct <https://huggingface.co/databricks/dbrx-instruct>`_
   * -
     - ``google/gemma-2-27b``
     - `Gemma 2 27B <https://huggingface.co/google/gemma-2-27b>`_
   * -
     - ``CohereForAI/c4ai-command-r-plus-08-2024``
     - `C4AI Command R+ 08-2024 <https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024>`_
   * -
     - ``deepseek-ai/deepseek-moe-16b-chat``
     - `DeepSeek MoE 16B <https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat>`_
   * - ``$model_repo``
     - ``amd/Llama-3.1-70B-Instruct-FP8-KV``
     - `Llama 3.1 70B FP8 <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`_
   * - (``float8``)
     - ``amd/Llama-3.1-405B-Instruct-FP8-KV``
     - `Llama 3.1 405B FP8 <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`_
   * -
     - ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
     - `Mixtral MoE 8x7B FP8 <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`_
   * -
     - ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
     - `Mixtral MoE 8x22B FP8 <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`_
   * -
     - ``amd/Mistral-7B-v0.1-FP8-KV``
     - `Mistral 7B FP8 <https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV>`_
   * -
     - ``amd/dbrx-instruct-FP8-KV``
     - `DBRX Instruct FP8 <https://huggingface.co/amd/dbrx-instruct-FP8-KV>`_
   * -
     - ``amd/c4ai-command-r-plus-FP8-KV``
     - `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_
   * - ``$num_gpu``
     - 1 or 8
     - Number of GPUs
   * - ``$datatype``
     - ``float16`` or ``float8``
     - Data type
 .. _vllm-benchmark-run-benchmark:
 Running the benchmark on the MI300X accelerator
 -----------------------------------------------
 Here are some examples of running the benchmark with various options.
 See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
 options and their descriptions.
 Example 1: latency benchmark
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 Use this command to benchmark the latency of the Llama 3.1 70B model on eight GPUs with the ``float16`` and ``float8`` data types.
 .. code-block::
   ./vllm_benchmark_report.sh -s latency -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16
   ./vllm_benchmark_report.sh -s latency -m amd/Llama-3.1-70B-Instruct-FP8-KV -g 8 -d float8
 Find the latency reports at:
 - ``./reports_float16/summary/Llama-3.1-70B-Instruct_latency_report.csv``
 - ``./reports_float8/summary/Llama-3.1-70B-Instruct-FP8-KV_latency_report.csv``
 Example 2: throughput benchmark
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 Use this command to benchmark the throughput of the Llama 3.1 70B model on eight GPUs with the ``float16`` and ``float8`` data types.
 .. code-block:: shell
   ./vllm_benchmark_report.sh -s throughput -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16
   ./vllm_benchmark_report.sh -s throughput -m amd/Llama-3.1-70B-Instruct-FP8-KV -g 8 -d float8
 Find the throughput reports at:
 - ``./reports_float16/summary/Llama-3.1-70B-Instruct_throughput_report.csv``
 - ``./reports_float8/summary/Llama-3.1-70B-Instruct-FP8-KV_throughput_report.csv``
 .. raw:: html
   <style>
   mjx-container[jax="CHTML"][display="true"] {
       text-align: left;
       margin: 0;
   }
   </style>
 .. note::
   Throughput is calculated as:
   - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
   - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
 Further reading
 ===============
 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`../inference-optimization/workload`.
 - To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
 - To learn more about system settings and management practices to configure your system for
  MI300X accelerators, see :doc:`../../system-optimization/mi300x`.
 - To learn how to run LLM models from Hugging Face or your own model, see
  :doc:`Running models from Hugging Face <hugging-face-models>`.
 - To learn how to optimize inference on LLMs, see
  :doc:`Inference optimization <../inference-optimization/index>`.
 - To learn how to fine-tune LLMs, see
  :doc:`Fine-tuning LLMs <../fine-tuning/index>`.
 Previous versions
 =================
 See :doc:`vllm-history` to find documentation for previous releases
 of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.7.3-20250325.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.7.3-20250325.rst
@@ -0,0 +1,329 @@
 :orphan:
 .. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate
 **********************************
 vLLM inference performance testing
 **********************************
 .. caution::
   This documentation does not reflect the latest version of ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.
 .. _vllm-benchmark-unified-docker:
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.7.3_20250325-benchmark-models.yaml
   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}
   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
   accelerators and includes the following components:
   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
   * `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
   * `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_
   * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
   With this Docker image, you can quickly test the :ref:`expected
   inference performance numbers <vllm-benchmark-performance-measurements>` for
   MI300X series accelerators.
   .. _vllm-benchmark-available-models:
   Available models
   ================
   .. raw:: html
      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
   {% endfor %}
          </div>
        </div>
        <div class="row mt-1">
          <div class="col-2 me-2 model-param-head">Model variant</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
      {% set models = model_group.models %}
      {% for model in models %}
         {% if models|length % 3 == 0 %}
            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
         {% else %}
            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
         {% endif %}
      {% endfor %}
   {% endfor %}
          </div>
        </div>
      </div>
   .. _vllm-benchmark-vllm:
   {% for model_group in model_groups %}
      {% for model in model_group.models %}
   .. container:: model-doc {{model.mad_tag}}
      .. note::
         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
         Some models require access authorization prior to use via an external license agreement through a third party.
      {% endfor %}
   {% endfor %}
   .. note::
      vLLM is a toolkit and library for LLM inference and serving. AMD implements
      high-performance custom kernels and modules in vLLM to enhance performance.
      See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
      more information.
   .. _vllm-benchmark-performance-measurements:
   Performance measurements
   ========================
   To evaluate performance, the
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
   page provides reference throughput and latency measurements for inferencing
   popular AI models.
   .. important::
      The performance data presented in
      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
      only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
   Advanced features and known issues
   ==================================
   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
   see the developer's guide at `<https://github.com/ROCm/vllm/tree/25070a1841df0dca585b7ddcb967c42aaec4b7c5/docs/dev-docker>`__.
   Getting started
   ===============
   Use the following procedures to reproduce the benchmark results on an
   MI300X accelerator with the prebuilt vLLM Docker image.
   .. _vllm-benchmark-get-started:
   1. Disable NUMA auto-balancing.
      To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
      might hang until the periodic balancing is finalized. For more information,
      see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
      .. code-block:: shell
         # disable automatic NUMA balancing
         sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
         # check if NUMA balancing is disabled (returns 0 if disabled)
         cat /proc/sys/kernel/numa_balancing
         0
   2. Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
      Use the following command to pull the Docker image from Docker Hub.
      .. code-block:: shell
         docker pull {{ unified_docker.pull_tag }}
   Benchmarking
   ============
   Once the setup is complete, choose between two options to reproduce the
   benchmark results:
   .. _vllm-benchmark-mad:
   {% for model_group in model_groups %}
      {% for model in model_group.models %}
   .. container:: model-doc {{model.mad_tag}}
      .. tab-set::
         .. tab-item:: MAD-integrated benchmarking
            Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
            directory and install the required packages on the host machine.
            .. code-block:: shell
               git clone https://github.com/ROCm/MAD
               cd MAD
               pip install -r requirements.txt
            Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
            using one GPU with the ``{{model.precision}}`` data type on the host machine.
            .. code-block:: shell
               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
               python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
            MAD launches a Docker container with the name
            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
            to collect latency and throughput performance data, you can also change the benchmarking
            parameters. See the standalone benchmarking tab for more information.
         .. tab-item:: Standalone benchmarking
            Run the vLLM benchmark tool independently by starting the
            `Docker container <{{ unified_docker.docker_hub_url }}>`_
            as shown in the following snippet.
            .. code-block::
               docker pull {{ unified_docker.pull_tag }}
               docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
            In the Docker container, clone the ROCm MAD repository and navigate to the
            benchmark scripts directory at ``~/MAD/scripts/vllm``.
            .. code-block::
               git clone https://github.com/ROCm/MAD
               cd MAD/scripts/vllm
            To start the benchmark, use the following command with the appropriate options.
            .. code-block::
               ./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
            .. list-table::
               :header-rows: 1
               :align: center
               * - Name
                 - Options
                 - Description
               * - ``$test_option``
                 - latency
                 - Measure decoding token latency
               * -
                 - throughput
                 - Measure token generation throughput
               * -
                 - all
                 - Measure both throughput and latency
               * - ``$num_gpu``
                 - 1 or 8
                 - Number of GPUs
               * - ``$datatype``
                 - ``float16`` or ``float8``
                 - Data type
            .. note::
               The input sequence length, output sequence length, and tensor parallel (TP) are
               already configured. You don't need to specify them with this script.
            .. note::
               If you encounter the following error, pass your access-authorized Hugging
               Face token to the gated models.
               .. code-block::
                  OSError: You are trying to access a gated repo.
                  # pass your HF_TOKEN
                  export HF_TOKEN=$your_personal_hf_token
            Here are some examples of running the benchmark with various options.
            * Latency benchmark
              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.
              .. code-block::
                 ./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
            * Throughput benchmark
              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.
              .. code-block:: shell
                 ./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
            .. raw:: html
               <style>
               mjx-container[jax="CHTML"][display="true"] {
                  text-align: left;
                  margin: 0;
               }
               </style>
            .. note::
               Throughput is calculated as:
               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
      {% endfor %}
   {% endfor %}
 Further reading
 ===============
 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`../inference-optimization/workload`.
 - To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
 - To learn more about system settings and management practices to configure your system for
  MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
 - To learn how to run LLM models from Hugging Face or your own model, see
  :doc:`Running models from Hugging Face <hugging-face-models>`.
 - To learn how to optimize inference on LLMs, see
  :doc:`Inference optimization <../inference-optimization/index>`.
 - To learn how to fine-tune LLMs, see
  :doc:`Fine-tuning LLMs <../fine-tuning/index>`.
 Previous versions
 =================
 See :doc:`vllm-history` to find documentation for previous releases
 of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.3-20250415.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.3-20250415.rst
@@ -9,7 +9,7 @@ vLLM inference performance testing
 .. _vllm-benchmark-unified-docker:
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.3_20250415-benchmark-models.yaml
   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}
@@ -98,18 +98,18 @@ vLLM inference performance testing
   page provides reference throughput and latency measurements for inferencing
   popular AI models.
-   .. note::
+   .. important::
      The performance data presented in
      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-      should not be interpreted as the peak performance achievable by AMD
+      only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
-      Instinct MI325X and MI300X accelerators or ROCm software.
+      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
   Advanced features and known issues
   ==================================
   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
-   see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.
+   see the developer's guide at `<https://github.com/ROCm/vllm/tree/7a9f58aae0e7215a5f3dccde60e35072c41656c2/docs/dev-docker>`__.
   System validation
   =================
@@ -339,43 +339,5 @@ Further reading
 Previous versions
 =================
-This table lists previous versions of the ROCm vLLM inference Docker image for
+See :doc:`vllm-history` to find documentation for previous releases
-inference performance testing. For detailed information about available models
+of the ``ROCm/vllm`` Docker image.
 for benchmarking, see the version-specific documentation.
 .. list-table::
   :header-rows: 1
   :stub-columns: 1
   * - ROCm version
     - vLLM version
     - PyTorch version
     - Resources
   * - 6.3.1
     - 0.7.3
     - 2.7.0
     - 
       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.3/how-to/rocm-for-ai/inference/vllm-benchmark.html>`_
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`_
   * - 6.3.1
     - 0.6.6
     - 2.7.0
     - 
       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.2/how-to/rocm-for-ai/inference/vllm-benchmark.html>`_
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_
   * - 6.2.1
     - 0.6.4
     - 2.5.0
     - 
       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_
   * - 6.2.0
     - 0.4.3
     - 2.4.0
     - 
       * `Documentation <https://rocm.docs.amd.com/en/docs-6.2.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`_
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513.rst
@@ -0,0 +1,354 @@
 :orphan:
 .. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate
 **********************************
 vLLM inference performance testing
 **********************************
 .. caution::
   This documentation does not reflect the latest version of ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.
 .. _vllm-benchmark-unified-docker:
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250513-benchmark-models.yaml
   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}
   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
   accelerators and includes the following components:
   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
   * `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
   * `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_
   * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
   With this Docker image, you can quickly test the :ref:`expected
   inference performance numbers <vllm-benchmark-performance-measurements>` for
   MI300X series accelerators.
   .. _vllm-benchmark-available-models:
   Supported models
   ================
   The following models are supported for inference performance benchmarking
   with vLLM and ROCm. Some instructions, commands, and recommendations in this
   documentation might vary by model -- select one to get started.
   .. raw:: html
      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
          <div class="col-2 me-2 model-param-head">Model group</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
   {% endfor %}
          </div>
        </div>
        <div class="row mt-1">
          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
      {% set models = model_group.models %}
      {% for model in models %}
         {% if models|length % 3 == 0 %}
            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
         {% else %}
            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
         {% endif %}
      {% endfor %}
   {% endfor %}
          </div>
        </div>
      </div>
   .. _vllm-benchmark-vllm:
   {% for model_group in model_groups %}
      {% for model in model_group.models %}
   .. container:: model-doc {{model.mad_tag}}
      .. note::
         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
         Some models require access authorization prior to use via an external license agreement through a third party.
      {% endfor %}
   {% endfor %}
   .. note::
      vLLM is a toolkit and library for LLM inference and serving. AMD implements
      high-performance custom kernels and modules in vLLM to enhance performance.
      See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
      more information.
   .. _vllm-benchmark-performance-measurements:
   Performance measurements
   ========================
   To evaluate performance, the
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
   page provides reference throughput and latency measurements for inferencing
   popular AI models.
   .. important::
      The performance data presented in
      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
      only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
   Advanced features and known issues
   ==================================
   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
   see the developer's guide at `<https://github.com/ROCm/vllm/tree/16d2b92ebcf90fe55cf73fa0b9329a6c9d3dede8/docs/dev-docker>`__.
   System validation
   =================
   Before running AI workloads, it's important to validate that your AMD hardware is configured
   correctly and performing optimally.
   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
   might hang until the periodic balancing is finalized. For more information,
   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
   .. code-block:: shell
      # disable automatic NUMA balancing
      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
      # check if NUMA balancing is disabled (returns 0 if disabled)
      cat /proc/sys/kernel/numa_balancing
      0
   To test for optimal performance, consult the recommended :ref:`System health benchmarks
   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
   system's configuration.
   Pull the Docker image
   =====================
   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
   Use the following command to pull the Docker image from Docker Hub.
   .. code-block:: shell
      docker pull {{ unified_docker.pull_tag }}
   Benchmarking
   ============
   Once the setup is complete, choose between two options to reproduce the
   benchmark results:
   .. _vllm-benchmark-mad:
   {% for model_group in model_groups %}
      {% for model in model_group.models %}
   .. container:: model-doc {{model.mad_tag}}
      .. tab-set::
         .. tab-item:: MAD-integrated benchmarking
            Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
            directory and install the required packages on the host machine.
            .. code-block:: shell
               git clone https://github.com/ROCm/MAD
               cd MAD
               pip install -r requirements.txt
            Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
            using one GPU with the ``{{model.precision}}`` data type on the host machine.
            .. code-block:: shell
               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
               python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
            MAD launches a Docker container with the name
            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
            to collect latency and throughput performance data, you can also change the benchmarking
            parameters. See the standalone benchmarking tab for more information.
            {% if model.tunableop %}
            .. note::
               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
               TunableOp automatically explores different implementations and configurations of certain PyTorch
               operators to find the fastest one for your hardware.
               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
               (see
               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
               enable it, edit the default run behavior in the ``models.json``
               configuration before running inference -- update the model's run
               ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
            {% endif %}
         .. tab-item:: Standalone benchmarking
            Run the vLLM benchmark tool independently by starting the
            `Docker container <{{ unified_docker.docker_hub_url }}>`_
            as shown in the following snippet.
            .. code-block::
               docker pull {{ unified_docker.pull_tag }}
               docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
            In the Docker container, clone the ROCm MAD repository and navigate to the
            benchmark scripts directory at ``~/MAD/scripts/vllm``.
            .. code-block::
               git clone https://github.com/ROCm/MAD
               cd MAD/scripts/vllm
            To start the benchmark, use the following command with the appropriate options.
            .. code-block::
               ./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
            .. list-table::
               :header-rows: 1
               :align: center
               * - Name
                 - Options
                 - Description
               * - ``$test_option``
                 - latency
                 - Measure decoding token latency
               * -
                 - throughput
                 - Measure token generation throughput
               * -
                 - all
                 - Measure both throughput and latency
               * - ``$num_gpu``
                 - 1 or 8
                 - Number of GPUs
               * - ``$datatype``
                 - ``float16`` or ``float8``
                 - Data type
            .. note::
               The input sequence length, output sequence length, and tensor parallelism (TP) are
               already configured. You don't need to specify them with this script.
            .. note::
               If you encounter the following error, pass your access-authorized Hugging
               Face token to the gated models.
               .. code-block::
                  OSError: You are trying to access a gated repo.
                  # pass your HF_TOKEN
                  export HF_TOKEN=$your_personal_hf_token
            Here are some examples of running the benchmark with various options.
            * Latency benchmark
              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
              .. code-block::
                 ./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
            * Throughput benchmark
              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
              .. code-block:: shell
                 ./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
            .. raw:: html
               <style>
               mjx-container[jax="CHTML"][display="true"] {
                  text-align: left;
                  margin: 0;
               }
               </style>
            .. note::
               Throughput is calculated as:
               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
      {% endfor %}
   {% endfor %}
 Further reading
 ===============
 - To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
 - To learn more about system settings and management practices to configure your system for
  MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`../../../inference-optimization/workload`.
 - To learn how to run LLM models from Hugging Face or your own model, see
  :doc:`Running models from Hugging Face <../../hugging-face-models>`.
 - To learn how to optimize inference on LLMs, see
  :doc:`Inference optimization <../../../inference-optimization/index>`.
 - To learn how to fine-tune LLMs, see
  :doc:`Fine-tuning LLMs <../../../fine-tuning/index>`.
 Previous versions
 =================
 See :doc:`vllm-history` to find documentation for previous releases
 of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521.rst
@@ -0,0 +1,354 @@
 :orphan:
 .. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate
 **********************************
 vLLM inference performance testing
 **********************************
 .. caution::
   This documentation does not reflect the latest version of ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.
 .. _vllm-benchmark-unified-docker:
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250521-benchmark-models.yaml
   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}
   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
   accelerators and includes the following components:
   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
   * `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
   * `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
   * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
   With this Docker image, you can quickly test the :ref:`expected
   inference performance numbers <vllm-benchmark-performance-measurements>` for
   MI300X series accelerators.
   .. _vllm-benchmark-available-models:
   Supported models
   ================
   The following models are supported for inference performance benchmarking
   with vLLM and ROCm. Some instructions, commands, and recommendations in this
   documentation might vary by model -- select one to get started.
   .. raw:: html
      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
          <div class="col-2 me-2 model-param-head">Model group</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
   {% endfor %}
          </div>
        </div>
        <div class="row mt-1">
          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
      {% set models = model_group.models %}
      {% for model in models %}
         {% if models|length % 3 == 0 %}
            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
         {% else %}
            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
         {% endif %}
      {% endfor %}
   {% endfor %}
          </div>
        </div>
      </div>
   .. _vllm-benchmark-vllm:
   {% for model_group in model_groups %}
      {% for model in model_group.models %}
   .. container:: model-doc {{model.mad_tag}}
      .. note::
         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
         Some models require access authorization prior to use via an external license agreement through a third party.
      {% endfor %}
   {% endfor %}
   .. note::
      vLLM is a toolkit and library for LLM inference and serving. AMD implements
      high-performance custom kernels and modules in vLLM to enhance performance.
      See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
      more information.
   .. _vllm-benchmark-performance-measurements:
   Performance measurements
   ========================
   To evaluate performance, the
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
   page provides reference throughput and latency measurements for inferencing
   popular AI models.
   .. note::
      The performance data presented in
      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
      should not be interpreted as the peak performance achievable by AMD
      Instinct MI325X and MI300X accelerators or ROCm software.
   Advanced features and known issues
   ==================================
   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
   see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.
   System validation
   =================
   Before running AI workloads, it's important to validate that your AMD hardware is configured
   correctly and performing optimally.
   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
   might hang until the periodic balancing is finalized. For more information,
   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
   .. code-block:: shell
      # disable automatic NUMA balancing
      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
      # check if NUMA balancing is disabled (returns 0 if disabled)
      cat /proc/sys/kernel/numa_balancing
      0
   To test for optimal performance, consult the recommended :ref:`System health benchmarks
   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
   system's configuration.
   Pull the Docker image
   =====================
   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
   Use the following command to pull the Docker image from Docker Hub.
   .. code-block:: shell
      docker pull {{ unified_docker.pull_tag }}
   Benchmarking
   ============
   Once the setup is complete, choose between two options to reproduce the
   benchmark results:
   .. _vllm-benchmark-mad:
   {% for model_group in model_groups %}
      {% for model in model_group.models %}
   .. container:: model-doc {{model.mad_tag}}
      .. tab-set::
         .. tab-item:: MAD-integrated benchmarking
            Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
            directory and install the required packages on the host machine.
            .. code-block:: shell
               git clone https://github.com/ROCm/MAD
               cd MAD
               pip install -r requirements.txt
            Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
            using one GPU with the ``{{model.precision}}`` data type on the host machine.
            .. code-block:: shell
               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
               python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
            MAD launches a Docker container with the name
            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
            to collect latency and throughput performance data, you can also change the benchmarking
            parameters. See the standalone benchmarking tab for more information.
            {% if model.tunableop %}
            .. note::
               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
               TunableOp automatically explores different implementations and configurations of certain PyTorch
               operators to find the fastest one for your hardware.
               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
               (see
               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
               enable it, edit the default run behavior in the ``models.json``
               configuration before running inference -- update the model's run
               ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
            {% endif %}
         .. tab-item:: Standalone benchmarking
            Run the vLLM benchmark tool independently by starting the
            `Docker container <{{ unified_docker.docker_hub_url }}>`_
            as shown in the following snippet.
            .. code-block::
               docker pull {{ unified_docker.pull_tag }}
               docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
            In the Docker container, clone the ROCm MAD repository and navigate to the
            benchmark scripts directory at ``~/MAD/scripts/vllm``.
            .. code-block::
               git clone https://github.com/ROCm/MAD
               cd MAD/scripts/vllm
            To start the benchmark, use the following command with the appropriate options.
            .. code-block::
               ./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
            .. list-table::
               :header-rows: 1
               :align: center
               * - Name
                 - Options
                 - Description
               * - ``$test_option``
                 - latency
                 - Measure decoding token latency
               * -
                 - throughput
                 - Measure token generation throughput
               * -
                 - all
                 - Measure both throughput and latency
               * - ``$num_gpu``
                 - 1 or 8
                 - Number of GPUs
               * - ``$datatype``
                 - ``float16`` or ``float8``
                 - Data type
            .. note::
               The input sequence length, output sequence length, and tensor parallelism (TP) are
               already configured. You don't need to specify them with this script.
            .. note::
               If you encounter the following error, pass your access-authorized Hugging
               Face token to the gated models.
               .. code-block::
                  OSError: You are trying to access a gated repo.
                  # pass your HF_TOKEN
                  export HF_TOKEN=$your_personal_hf_token
            Here are some examples of running the benchmark with various options.
            * Latency benchmark
              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
              .. code-block::
                 ./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
            * Throughput benchmark
              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
              .. code-block:: shell
                 ./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
            .. raw:: html
               <style>
               mjx-container[jax="CHTML"][display="true"] {
                  text-align: left;
                  margin: 0;
               }
               </style>
            .. note::
               Throughput is calculated as:
               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
      {% endfor %}
   {% endfor %}
 Further reading
 ===============
 - To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
 - To learn more about system settings and management practices to configure your system for
  MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`../../../inference-optimization/workload`.
 - To learn how to run LLM models from Hugging Face or your own model, see
  :doc:`Running models from Hugging Face <../../hugging-face-models>`.
 - To learn how to optimize inference on LLMs, see
  :doc:`Inference optimization <../../../inference-optimization/index>`.
 - To learn how to fine-tune LLMs, see
  :doc:`Fine-tuning LLMs <../../../fine-tuning/index>`.
 Previous versions
 =================
 See :doc:`vllm-history` to find documentation for previous releases
 of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
@@ -0,0 +1,75 @@
 :orphan:
 **************************************************
 vLLM inference performance testing version history
 **************************************************
 This table lists previous versions of the ROCm vLLM inference Docker image for
 inference performance testing. For detailed information about available models
 for benchmarking, see the version-specific documentation. You can find tagged
 previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`_.
 .. list-table::
   :header-rows: 1
   :stub-columns: 1
   * - ROCm version
     - vLLM version
     - PyTorch version
     - Resources
   * - 6.4.0
     - 0.9.0.1
     - 2.7.0
     - 
       * :doc:`Documentation <../vllm>`
       * `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`__
   * - 6.3.1
     - 0.8.5 (0.8.6.dev)
     - 2.7.0
     - 
       * :doc:`Documentation <vllm-0.8.5-20250521>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_
   * - 6.3.1
     - 0.8.5
     - 2.7.0
     - 
       * :doc:`Documentation <vllm-0.8.5-20250513>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`_
   * - 6.3.1
     - 0.8.3
     - 2.7.0
     - 
       * :doc:`Documentation <vllm-0.8.3-20250415>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`_
   * - 6.3.1
     - 0.7.3
     - 2.7.0
     - 
       * :doc:`Documentation <vllm-0.7.3-20250325>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`_
   * - 6.3.1
     - 0.6.6
     - 2.7.0
     - 
       * :doc:`Documentation <vllm-0.6.6>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_
   * - 6.2.1
     - 0.6.4
     - 2.5.0
     - 
       * :doc:`Documentation <vllm-0.6.4>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_
   * - 6.2.0
     - 0.4.3
     - 2.4.0
     - 
       * :doc:`Documentation <vllm-0.4.3>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`_
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
@@ -24,20 +24,24 @@ PyTorch inference performance testing
   Supported models
   ================
   The following models are supported for inference performance benchmarking
   with PyTorch and ROCm. Some instructions, commands, and recommendations in this
   documentation might vary by model -- select one to get started.
   .. raw:: html
      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
-          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="col-2 me-2 model-param-head">Model group</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
-            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+            <div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
   {% endfor %}
          </div>
        </div>
        <div class="row mt-1" style="display: none;">
-          <div class="col-2 me-2 model-param-head">Model variant</div>
+          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
      {% set models = model_group.models %}
@@ -99,7 +103,7 @@ PyTorch inference performance testing
         The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
-   .. container:: model-doc pyt_clip_inference
+   .. container:: model-doc pyt_clip_inference pyt_mochi_video_inference
      Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.
@@ -162,11 +166,14 @@ Further reading
 - To learn more about system settings and management practices to configure your system for
  MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`../../inference-optimization/workload`.
 - To learn how to run LLM models from Hugging Face or your model, see
-  :doc:`Running models from Hugging Face <hugging-face-models>`.
+  :doc:`Running models from Hugging Face <../hugging-face-models>`.
 - To learn how to optimize inference on LLMs, see
-  :doc:`Inference optimization <../inference-optimization/index>`.
+  :doc:`Inference optimization <../../inference-optimization/index>`.
 - To learn how to fine-tune LLMs, see
-  :doc:`Fine-tuning LLMs <../fine-tuning/index>`.
+  :doc:`Fine-tuning LLMs <../../fine-tuning/index>`.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
@@ -0,0 +1,346 @@
 .. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate
 **********************************
 vLLM inference performance testing
 **********************************
 .. _vllm-benchmark-unified-docker:
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}
   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
   accelerators and includes the following components:
   * `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
   * `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
   * `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
   * `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
   With this Docker image, you can quickly test the :ref:`expected
   inference performance numbers <vllm-benchmark-performance-measurements>` for
   MI300X series accelerators.
   .. _vllm-benchmark-available-models:
   Supported models
   ================
   The following models are supported for inference performance benchmarking
   with vLLM and ROCm. Some instructions, commands, and recommendations in this
   documentation might vary by model -- select one to get started.
   .. raw:: html
      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
          <div class="col-2 me-2 model-param-head">Model group</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
   {% endfor %}
          </div>
        </div>
        <div class="row mt-1">
          <div class="col-2 me-2 model-param-head">Model</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
      {% set models = model_group.models %}
      {% for model in models %}
         {% if models|length % 3 == 0 %}
            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
         {% else %}
            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
         {% endif %}
      {% endfor %}
   {% endfor %}
          </div>
        </div>
      </div>
   .. _vllm-benchmark-vllm:
   {% for model_group in model_groups %}
      {% for model in model_group.models %}
   .. container:: model-doc {{model.mad_tag}}
      .. note::
         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
         Some models require access authorization prior to use via an external license agreement through a third party.
      {% endfor %}
   {% endfor %}
   .. note::
      vLLM is a toolkit and library for LLM inference and serving. AMD implements
      high-performance custom kernels and modules in vLLM to enhance performance.
      See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
      more information.
   .. _vllm-benchmark-performance-measurements:
   Performance measurements
   ========================
   To evaluate performance, the
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
   page provides reference throughput and latency measurements for inferencing popular AI models.
   .. important::
      The performance data presented in
      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
      only reflects the latest version of this inference benchmarking environment.
      The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
   Advanced features and known issues
   ==================================
   For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
   see the developer's guide at `<https://github.com/ROCm/vllm/tree/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker>`__.
   System validation
   =================
   Before running AI workloads, it's important to validate that your AMD hardware is configured
   correctly and performing optimally.
   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
   might hang until the periodic balancing is finalized. For more information,
   see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
   .. code-block:: shell
      # disable automatic NUMA balancing
      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
      # check if NUMA balancing is disabled (returns 0 if disabled)
      cat /proc/sys/kernel/numa_balancing
      0
   To test for optimal performance, consult the recommended :ref:`System health benchmarks
   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
   system's configuration.
   Pull the Docker image
   =====================
   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
   Use the following command to pull the Docker image from Docker Hub.
   .. code-block:: shell
      docker pull {{ unified_docker.pull_tag }}
   Benchmarking
   ============
   Once the setup is complete, choose between two options to reproduce the
   benchmark results:
   .. _vllm-benchmark-mad:
   {% for model_group in model_groups %}
      {% for model in model_group.models %}
   .. container:: model-doc {{model.mad_tag}}
      .. tab-set::
         .. tab-item:: MAD-integrated benchmarking
            Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
            directory and install the required packages on the host machine.
            .. code-block:: shell
               git clone https://github.com/ROCm/MAD
               cd MAD
               pip install -r requirements.txt
            Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
            using one GPU with the ``{{model.precision}}`` data type on the host machine.
            .. code-block:: shell
               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
               python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
            MAD launches a Docker container with the name
            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
            to collect latency and throughput performance data, you can also change the benchmarking
            parameters. See the standalone benchmarking tab for more information.
            {% if model.tunableop %}
            .. note::
               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
               TunableOp automatically explores different implementations and configurations of certain PyTorch
               operators to find the fastest one for your hardware.
               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
               (see
               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
               enable it, edit the default run behavior in the ``models.json``
               configuration before running inference -- update the model's run
               ``args`` by changing ``--tunableop off`` to ``--tunableop on``.
               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
            {% endif %}
         .. tab-item:: Standalone benchmarking
            Run the vLLM benchmark tool independently by starting the
            `Docker container <{{ unified_docker.docker_hub_url }}>`_
            as shown in the following snippet.
            .. code-block::
               docker pull {{ unified_docker.pull_tag }}
               docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
            In the Docker container, clone the ROCm MAD repository and navigate to the
            benchmark scripts directory at ``~/MAD/scripts/vllm``.
            .. code-block::
               git clone https://github.com/ROCm/MAD
               cd MAD/scripts/vllm
            To start the benchmark, use the following command with the appropriate options.
            .. code-block::
               ./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
            .. list-table::
               :header-rows: 1
               :align: center
               * - Name
                 - Options
                 - Description
               * - ``$test_option``
                 - latency
                 - Measure decoding token latency
               * -
                 - throughput
                 - Measure token generation throughput
               * -
                 - all
                 - Measure both throughput and latency
               * - ``$num_gpu``
                 - 1 or 8
                 - Number of GPUs
               * - ``$datatype``
                 - ``float16`` or ``float8``
                 - Data type
            .. note::
               The input sequence length, output sequence length, and tensor parallel (TP) are
               already configured. You don't need to specify them with this script.
            .. note::
               If you encounter the following error, pass your access-authorized Hugging
               Face token to the gated models.
               .. code-block::
                  OSError: You are trying to access a gated repo.
                  # pass your HF_TOKEN
                  export HF_TOKEN=$your_personal_hf_token
            Here are some examples of running the benchmark with various options.
            * Latency benchmark
              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
              .. code-block::
                 ./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
            * Throughput benchmark
              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
              .. code-block:: shell
                 ./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
            .. raw:: html
               <style>
               mjx-container[jax="CHTML"][display="true"] {
                  text-align: left;
                  margin: 0;
               }
               </style>
            .. note::
               Throughput is calculated as:
               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
      {% endfor %}
   {% endfor %}
 Further reading
 ===============
 - To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
 - To learn more about system settings and management practices to configure your system for
  MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`../../inference-optimization/workload`.
 - To learn how to run LLM models from Hugging Face or your own model, see
  :doc:`Running models from Hugging Face <../hugging-face-models>`.
 - To learn how to optimize inference on LLMs, see
  :doc:`Inference optimization <../../inference-optimization/index>`.
 - To learn how to fine-tune LLMs, see
  :doc:`Fine-tuning LLMs <../../fine-tuning/index>`.
 Previous versions
 =================
 See :doc:`previous-versions/vllm-history` to find documentation for previous releases
 of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -374,22 +374,5 @@ own cluster setup.
 Previous versions
 =================
-This table lists previous versions of the ROCm JAX MaxText Docker image for training
+See :doc:`previous-versions/jax-maxtext-history` to find documentation for previous releases
-performance testing. For detailed information about available models for
+of the ``ROCm/jax-training`` Docker image.
 benchmarking, see the version-specific documentation.
 .. list-table::
   :header-rows: 1
   :stub-columns: 1
   * - Image version
     - ROCm version
     - JAX version
     - Resources
   * - 25.4
     - 6.3.0
     - 0.4.31
     - 
       * `Documentation <https://rocm.docs.amd.com/en/docs-6.3.3/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.html>`_
       * `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.4/images/sha256-fb3eb71cd74298a7b3044b7130cf84113f14d518ff05a2cd625c11ea5f6a7b01>`_
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
@@ -0,0 +1,34 @@
 :orphan:
 ********************************************************
 JAX MaxText training performance testing version history
 ********************************************************
 This table lists previous versions of the ROCm JAX MaxText Docker image for training
 performance testing. For detailed information about available models for
 benchmarking, see the version-specific documentation.
 You can find tagged
 previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/jax-training/tags>`_.
 .. list-table::
   :header-rows: 1
   :stub-columns: 1
   * - Image version
     - ROCm version
     - JAX version
     - Resources
   * - 25.5
     - 6.3.4
     - 0.4.35
     - 
       * :doc:`Documentation <../jax-maxtext>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`_
   * - 25.4
     - 6.3.0
     - 0.4.31
     - 
       * :doc:`Documentation <jax-maxtext-v25.4>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.4/images/sha256-fb3eb71cd74298a7b3044b7130cf84113f14d518ff05a2cd625c11ea5f6a7b01>`_
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
@@ -0,0 +1,358 @@
 :orphan:
 .. meta::
   :description: How to train a model using JAX MaxText for ROCm.
   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
 **************************************
 Training a model with MaxText for ROCm
 **************************************
 .. caution::
   This documentation does not reflect the latest version of ROCm JAX MaxText
   training performance documentation. See :doc:`../jax-maxtext` for the latest version.
 MaxText is a high-performance, open-source framework built on the Google JAX
 machine learning library to train LLMs at scale. The MaxText framework for
 ROCm is an optimized fork of the upstream
 `<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
 on AMD MI300X series accelerators.
 The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.4``) image
 provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
 including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
 It includes the following software components:
 +--------------------------+--------------------------------+
 | Software component       | Version                        |
 +==========================+================================+
 | ROCm                     | 6.3.0                          |
 +--------------------------+--------------------------------+
 | JAX                      | 0.4.31                         |
 +--------------------------+--------------------------------+
 | Python                   | 3.10                           |
 +--------------------------+--------------------------------+
 | Transformer Engine       | 1.12.0.dev0+f81a3eb            |
 +--------------------------+--------------------------------+
 | hipBLASLt                | git78ec8622                    |
 +--------------------------+--------------------------------+
 Supported features and models
 =============================
 MaxText provides the following key features to train large language models efficiently:
 - Transformer Engine (TE)
 - Flash Attention (FA) 3
 - GEMM tuning
 - Multi-node support
 .. _amd-maxtext-model-support:
 The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
 * Llama 3.1 8B
 * Llama 3.1 70B
 * Llama 3 8B
 * Llama 3 70B
 * Llama 2 7B
 * Llama 2 70B
 * DeepSeek-V2-Lite
 .. note::
   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).
 Unsupported features
 --------------------
 Currently, MaxText's default packed input format is not supported. Using this format
 with the current Docker image results in incorrect attention calculations
 across different input sequences. Support for packed input format is planned for a future release.
 System validation
 =================
 If you have already validated your system settings, including NUMA
 auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
 and optimization steps <train-a-model-system-validation>` to set up your system
 before starting training.
 Environment setup
 =================
 This Docker image is optimized for specific model configurations outlined
 as follows. Performance can vary for other training workloads, as AMD
 doesn’t validate configurations and run conditions outside those described.
 .. _amd-maxtext-multi-node-setup:
 Multi-node setup
 ----------------
 For multi-node environments, ensure you have all the necessary packages for
 your network device, such as RDMA. If you're not using a multi-node setup
 with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
 1. Install the following packages to build and install the RDMA driver.
   .. code-block:: shell
      sudo apt install iproute2 -y
      sudo apt install -y linux-headers-"$(uname -r)" libelf-dev
      sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev
   Refer to your NIC manufacturer's documentation for further steps on
   compiling and installing the RoCE driver. For example, for Broadcom,
   see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
   in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.
 2. Set the following environment variables.
   a. Master address
      Change ``localhost`` to the master node's resolvable hostname or IP address:
      .. code-block:: bash
         export MASTER_ADDR="${MASTER_ADDR:-localhost}"
   b. Number of nodes
      Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):
      .. code-block:: bash
         export NNODES="${NNODES:-1}"
   c. Node ranks
      Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
      Node ranks should be unique across all nodes in the cluster.
      .. code-block:: bash
         export NODE_RANK="${NODE_RANK:-0}"
   d. Network interface
      Update the network interface in the script to match your system's network interface. To
      find your network interface, run the following (outside of any Docker container):
      .. code-block:: bash
         ip a
      Look for an active interface with an IP address in the same subnet as
      your other nodes. Then, update the following variable in the script, for
      example:
      .. code-block:: bash
         export NCCL_SOCKET_IFNAME=ens50f0np0
      This variable specifies which network interface to use for inter-node communication.
      Setting this variable to the incorrect interface can result in communication failures
      or significantly reduced performance.
   e. RDMA interface
      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
      Then, set the RDMA interfaces to use for communication.
      .. code-block:: bash
         # If using Broadcom NIC
         export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
         # If using Mellanox NIC
         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
 .. _amd-maxtext-download-docker:
 Download the Docker image
 -------------------------
 1. Use the following command to pull the Docker image from Docker Hub.
   .. code-block:: shell
      docker pull rocm/jax-training:maxtext-v25.4
 2. Run the Docker container.
   .. code-block:: shell
      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.4
 .. _amd-maxtext-get-started:
 Getting started
 ===============
 The following examples demonstrate how to get started with single node
 and multi-node training using the benchmarking scripts provided at
 `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
 .. important::
   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
 Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
 set correctly and points to your Hugging Face cache directory. Refer to the
 README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
 for more detailed instructions.
 Single node training benchmarking examples
 ------------------------------------------
 * Example 1: Single node training with Llama 2 7B
  Download the benchmarking script:
  .. code-block:: shell
     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
  Run the single node training benchmark:
  .. code-block:: shell
     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama2_7b.sh
 * Example 2: Single node training with Llama 2 70B
  Download the benchmarking script:
  .. code-block:: shell
     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
  Run the single node training benchmark:
  .. code-block:: shell
     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama2_70b.sh
 * Example 3: Single node training with Llama 3 8B
  Download the benchmarking script:
  .. code-block:: shell
     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
  Run the single node training benchmark:
  .. code-block:: shell
     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama3_8b.sh
 * Example 4: Single node training with Llama 3 70B
  Download the benchmarking script:
  .. code-block:: shell
     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
  Run the single node training benchmark:
  .. code-block:: shell
     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama3_70b.sh
 * Example 5: Single node training with DeepSeek V2 16B
  Download the benchmarking script:
  .. code-block:: shell
     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
  Run the single node training benchmark:
  .. code-block:: shell
     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./deepseek_v2_16b.sh
  .. note::
     The reported TFLOP/s by MaxText for DeepSeek is not accurate. Use
     the tokens/s as a performance indicator.
 Multi-node training benchmarking examples
 -----------------------------------------
 The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
 own cluster setup.
 * Example 1: Multi-node training with Llama 2 7B
  Download the benchmarking script:
  .. code-block:: shell
     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
  Run the multi-node training benchmark. For example:
  .. code-block:: shell
     sbatch -N <num_nodes> llama2_7b_multinode.sh
 * Example 2: Multi-node training with Llama 2 70B
  Download the benchmarking script:
  .. code-block:: shell
     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
  Run the multi-node training benchmark. For example:
  .. code-block:: shell
     sbatch -N <num_nodes> llama2_70b_multinode.sh
 * Example 3: Multi-node training with Llama 3 8B model
  Download the benchmarking script:
  .. code-block:: shell
     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
  Run the multi-node training benchmark. For example:
  .. code-block:: shell
     sbatch -N <num_nodes> llama3_8b_multinode.sh
 * Example 4: Multi-node training with Llama 3 70B model
  Download the benchmarking script:
  .. code-block:: shell
     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
  Run the multi-node training benchmark. For example:
  .. code-block:: shell
     sbatch -N <num_nodes> llama3_70b_multinode.sh
 Previous versions
 =================
 See :doc:`jax-maxtext-history` to find documentation for previous releases
 of the ``ROCm/jax-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
@@ -0,0 +1,47 @@
 :orphan:
 ********************************************************
 Megatron-LM training performance testing version history
 ********************************************************
 This table lists previous versions of the ROCm Megatron-LM Docker image for
 training performance testing. For detailed information about available models
 for benchmarking, see the version-specific documentation. You can find tagged
 previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/megatron-lm/tags>`_.
 .. list-table::
   :header-rows: 1
   :stub-columns: 1
   * - Image version
     - ROCm version
     - PyTorch version
     - Resources
   * - v25.5
     - 6.3.4
     - 2.8.0a0+gite2f9759
     - 
       * :doc:`Documentation <../megatron-lm>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.5_py312/images/sha256-4506f18ba188d24189c6b1f95130b425f52c528a543bb3f420351824edceadc2>`_
   * - v25.4
     - 6.3.0
     - 2.7.0a0+git637433 
     - 
       * :doc:`Documentation <megatron-lm-v25.4>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.4/images/sha256-941aa5387918ea91c376c13083aa1e6c9cab40bb1875abbbb73bbb65d8736b3f>`_
   * - v25.3
     - 6.3.0
     - 2.7.0a0+git637433 
     - 
       * :doc:`Documentation <megatron-lm-v25.3>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.3/images/sha256-1e6ed9bdc3f4ca397300d5a9907e084ab5e8ad1519815ee1f868faf2af1e04e2>`_
   * - v24.12-dev
     - 6.1.0
     - 2.4.0
     - 
       * :doc:`Documentation <megatron-lm-v24.12-dev>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/24.12-dev/images/sha256-5818c50334ce3d69deeeb8f589d83ec29003817da34158ebc9e2d112b929bf2e>`_
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
@@ -0,0 +1,515 @@
 :orphan:
 .. meta::
   :description: How to train a model using ROCm Megatron-LM
   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
 **************************************
 Training a model with ROCm Megatron-LM
 **************************************
 .. caution::
   This documentation does not reflect the latest version of ROCm Megatron-LM
   training performance documentation. See :doc:`../megatron-lm` for the latest version.
 .. _amd-megatron-lm:
 The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
 enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
 accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
 workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
 like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
 efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
 For ease of use, AMD provides a ready-to-use Docker image for MI300X accelerators containing essential
 components, including PyTorch, PyTorch Lightning, ROCm libraries, and Megatron-LM utilities. It contains the
 following software to accelerate training workloads:
 +--------------------------+--------------------------------+
 | Software component       | Version                        |
 +==========================+================================+
 | ROCm                     | 6.1                            |
 +--------------------------+--------------------------------+
 | PyTorch                  | 2.4.0                          |
 +--------------------------+--------------------------------+
 | PyTorch Lightning        | 2.4.0                          |
 +--------------------------+--------------------------------+
 | Megatron Core            | 0.9.0                          |
 +--------------------------+--------------------------------+
 | Transformer Engine       | 1.5.0                          |
 +--------------------------+--------------------------------+
 | Flash Attention          | v2.6                           |
 +--------------------------+--------------------------------+
 | Transformers             | 4.44.0                         |
 +--------------------------+--------------------------------+
 Supported features and models
 =============================
 Megatron-LM provides the following key features to train large language models efficiently:
 - Transformer Engine (TE)
 - APEX
 - GEMM tuning
 - Torch.compile
 - 3D parallelism: TP + SP + CP
 - Distributed optimizer
 - Flash Attention (FA) 2
 - Fused kernels
 - Pre-training
 .. _amd-megatron-lm-model-support:
 The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
 * Llama 2 7B
 * Llama 2 70B
 * Llama 3 8B
 * Llama 3 70B
 * Llama 3.1 8B
 * Llama 3.1 70B
 Prerequisite system validation steps
 ====================================
 Complete the following system validation and optimization steps to set up your system before starting training.
 Disable NUMA auto-balancing
 ---------------------------
 Generally, application performance can benefit from disabling NUMA auto-balancing. However,
 it might be detrimental to performance with certain types of workloads.
 Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
 Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
 the output is ``1``, run the following command to disable NUMA auto-balancing.
 .. code-block:: shell
   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
 See :ref:`mi300x-disable-numa` for more information.
 Hardware verification with ROCm
 -------------------------------
 Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
 instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
 GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
 You can restore this setting to its default value with the ``rocm-smi -r`` command.
 Run the command:
 .. code-block:: shell
   rocm-smi --setperfdeterminism 1900
 See :ref:`mi300x-hardware-verification-with-rocm` for more information.
 RCCL Bandwidth Test
 -------------------
 ROCm Collective Communications Library (RCCL) is a standalone library of standard collective communication
 routines for GPUs. See the :doc:`RCCL documentation <rccl:index>` for more information. Before starting
 pre-training, running a RCCL bandwidth test helps ensure that the multi-GPU or multi-node setup is optimized
 for efficient distributed training.
 Running the RCCL bandwidth test helps verify that:
 - The GPUs can communicate across nodes or within a single node.
 - The interconnect (such as InfiniBand, Ethernet, or Infinity Fabric) is functioning as expected and
  provides adequate bandwidth for communication.
 - No hardware setup or cabling issues could affect the communication between GPUs.
 Tuning and optimizing hyperparameters
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 In distributed training, specific hyperparameters related to distributed communication can be tuned based on
 the results of the RCCL bandwidth test. These variables are already set in the Docker image:
 .. code-block:: shell
   # force all RCCL streams to be high priority
   export TORCH_NCCL_HIGH_PRIORITY=1
   # specify which RDMA interfaces to use for communication
   export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
   # define the Global ID index used in RoCE mode
   export NCCL_IB_GID_INDEX=3
   # avoid data corruption/mismatch issue that existed in past releases
   export RCCL_MSCCL_ENABLE=0
 Running the RCCL Bandwidth Test
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 It's recommended you run the RCCL bandwidth test before launching training. It ensures system
 performance is sufficient to launch training. RCCL is not included in the AMD Megatron-LM Docker
 image; follow the instructions in `<https://github.com/ROCm/rccl-tests>`__ to get started.
 See :ref:`mi300x-rccl` for more information.
 Run on 8 GPUs (``-g 8``), scanning from 8 bytes to 10 GB:
 .. code-block:: shell
   ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8
 .. image:: ../../data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
   :width: 800
 Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node is
 recommended. So, a run on 8 GPUs looks something like:
 .. code-block:: shell
   mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1
 .. image:: ../../data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
   :width: 800
 Running with one MPI process per GPU ensures a one-to-one mapping for CPUs and GPUs, which can be beneficial
 for smaller message sizes. This better represents the real-world use of RCCL in deep learning frameworks like
 PyTorch and TensorFlow.
 Use the following script to run the RCCL test for four MI300X GPU nodes. Modify paths and node addresses as needed.
 .. code-block::
   /home/$USER/ompi_for_gpu/ompi/bin/mpirun -np 32 -H tw022:8,tw024:8,tw010:8,tw015:8 \
   --mca pml ucx \
   --mca btl ^openib \
   -x NCCL_SOCKET_IFNAME=ens50f0np0 \
   -x NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7:1 \
   -x NCCL_IB_GID_INDEX=3 \
   -x NCCL_MIN_NCHANNELS=40 \
   -x NCCL_DEBUG=version \
   $HOME/rccl-tests/build/all_reduce_perf -b 8 -e 8g -f 2 -g 1
 .. image:: ../../data/how-to/rocm-for-ai/rccl-tests-4-mi300x-gpu-nodes.png
   :width: 800
 .. _mi300x-amd-megatron-lm-training:
 Start training on MI300X accelerators
 =====================================
 The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
 training benchmarks, and achieve superior performance for models like Llama 2 and Llama 3.1.
 Use the following instructions to set up the environment, configure the script to train models, and
 reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
 image.
 .. _amd-megatron-lm-requirements:
 Download the Docker image and required packages
 -----------------------------------------------
 1. Use the following command to pull the Docker image from Docker Hub.
   .. code-block:: shell
      docker pull rocm/megatron-lm:24.12-dev
 2. Launch the Docker container.
   .. code-block:: shell
      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $CACHE_DIR:/root/.cache --name megatron-dev-env rocm/megatron-lm:24.12-dev /bin/bash
 3. Clone the ROCm Megatron-LM repository to a local directory and install the required packages on the host machine.
   .. code-block:: shell
      git clone https://github.com/ROCm/Megatron-LM
      cd Megatron-LM
   .. note::
      This release is validated with ``ROCm/Megatron-LM`` commit `bb93ccb <https://github.com/ROCm/Megatron-LM/tree/bb93ccbfeae6363c67b361a97a27c74ab86e7e92>`_.
      Checking out this specific commit is recommended for a stable and reproducible environment.
      .. code-block:: shell
         git checkout bb93ccbfeae6363c67b361a97a27c74ab86e7e92
 Prepare training datasets
 -------------------------
 If you already have the preprocessed data, you can skip this section.
 Use the following command to process datasets. GPT data is used as an example. You can change the merge table,
 append an end-of-document token, remove sentence splitting, and choose the tokenizer type.
 .. code-block:: shell
   python tools/preprocess_data.py \
       --input my-corpus.json \
       --output-prefix my-gpt2 \
       --vocab-file gpt2-vocab.json \
       --tokenizer-type GPT2BPETokenizer \
       --merge-file gpt2-merges.txt \
       --append-eod
 In this case, the automatically generated output files are named ``my-gpt2_text_document.bin`` and
 ``my-gpt2_text_document.idx``.
 .. image:: ../../data/how-to/rocm-for-ai/prep-training-datasets-my-gpt2-text-document.png
   :width: 800
 .. _amd-megatron-lm-environment-setup:
 Environment setup
 -----------------
 In the ``examples/llama`` directory of Megatron-LM, if you're working with Llama 2 7B or Llama 2 70B, use the
 ``train_llama2.sh`` configuration script. Likewise, if you're working with Llama 3 or Llama 3.1, then use
 ``train_llama3.sh`` and update the configuration script accordingly.
 Network interface
 ^^^^^^^^^^^^^^^^^
 To avoid connectivity issues, ensure the correct network interface is set in your training scripts.
 1. Run the following command to find the active network interface on your system.
   .. code-block:: shell
      ip a
 2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system’s network interface. For
   example:
   .. code-block:: shell
      export NCCL_SOCKET_IFNAME=ens50f0np0
      export GLOO_SOCKET_IFNAME=ens50f0np0
 Dataset options
 ^^^^^^^^^^^^^^^
 You can use either mock data or real data for training.
 * If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
  .. code-block:: shell
     DATA_DIR="/root/.cache/data" # Change to where your dataset is stored
     DATA_PATH=${DATA_DIR}/bookcorpus_text_sentence
  .. code-block:: shell
     --data-path $DATA_PATH
  Ensure that the files are accessible inside the Docker container.
 * Mock data can be useful for testing and validation. If you're using mock data, replace ``--data-path $DATA_PATH`` with the ``--mock-data`` option.
  .. code-block:: shell
     --mock-data
 Tokenizer
 ^^^^^^^^^
 Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
 models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
 a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
 fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
 handle a variety of input sequences, including unseen words or domain-specific terms.
 To train any of the Llama 2 models that this Docker image supports, use the ``Llama2Tokenizer``.
 To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
 Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
 For example, if you're using the Llama 3.1 8B model:
 .. code-block:: shell
   TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
 Run benchmark tests
 -------------------
 .. note::
   If you're running **multi node training**, update the following environment variables. They can
   also be passed as command line arguments.
   * Change ``localhost`` to the master node's hostname:
     .. code-block:: shell
        MASTER_ADDR="${MASTER_ADDR:-localhost}"
   * Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
     .. code-block:: shell
        NNODES="${NNODES:-1}"
   * Set the rank of each node (0 for master, 1 for the first worker node, and so on):
     .. code-block:: shell
        NODE_RANK="${NODE_RANK:-0}"
 * Use this command to run a performance benchmark test of any of the Llama 2 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
  .. code-block:: shell
     {variables} bash examples/llama/train_llama2.sh
 * Use this command to run a performance benchmark test of any of the Llama 3 and Llama 3.1 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).
  .. code-block:: shell
     {variables} bash examples/llama/train_llama3.sh
 .. _amd-megatron-lm-benchmark-test-vars:
 The benchmark tests support the same set of variables:
 +--------------------------+-----------------------+-----------------------+
 | Name                     | Options               | Description           |
 +==========================+=======================+=======================+
 | ``TEE_OUTPUT``           | 0 or 1                | 0: disable training   |
 |                          |                       | log                   |
 |                          |                       |                       |
 |                          |                       | 1: enable training    |
 |                          |                       | log                   |
 +--------------------------+-----------------------+-----------------------+
 | ``MBS``                  |                       | Micro batch size      |
 +--------------------------+-----------------------+-----------------------+
 | ``BS``                   |                       | Batch size            |
 +--------------------------+-----------------------+-----------------------+
 | ``TP``                   | 1, 2, 4, 8            | Tensor parallel       |
 +--------------------------+-----------------------+-----------------------+
 | ``TE_FP8``               | 0 or 1                | Datatype.             |
 |                          |                       | If it is set to 1,    |
 |                          |                       | FP8.                  |
 |                          |                       |                       |
 |                          |                       | If it is set to 0,    |
 |                          |                       | BF16                  |
 +--------------------------+-----------------------+-----------------------+
 | ``NO_TORCH_COMPILE``     | 0 or 1                | If it is set to 1,    |
 |                          |                       | enable torch.compile. |
 |                          |                       |                       |
 |                          |                       | If it is set to 0,    |
 |                          |                       | disable torch.compile |
 |                          |                       | (default)             |
 +--------------------------+-----------------------+-----------------------+
 | ``SEQ_LENGTH``           |                       | Input sequence length |
 +--------------------------+-----------------------+-----------------------+
 | ``GEMM_TUNING``          | 0 or 1                | If it is set to 1,    |
 |                          |                       | enable gemm tuning.   |
 |                          |                       |                       |
 |                          |                       | If it is set to 0,    |
 |                          |                       | disable gemm tuning   |
 +--------------------------+-----------------------+-----------------------+
 | ``USE_FLASH_ATTN``       | 0 or 1                | 0: disable flash      |
 |                          |                       | attention             |
 |                          |                       |                       |
 |                          |                       | 1: enable flash       |
 |                          |                       | attention             |
 +--------------------------+-----------------------+-----------------------+
 | ``ENABLE_PROFILING``     | 0 or 1                | 0: disable torch      |
 |                          |                       | profiling             |
 |                          |                       |                       |
 |                          |                       | 1: enable torch       |
 |                          |                       | profiling             |
 +--------------------------+-----------------------+-----------------------+
 | ``MODEL_SIZE``           |                       | The model size:       |
 |                          |                       | 7B/70B, etc.          |
 +--------------------------+-----------------------+-----------------------+
 | ``TOTAL_ITERS``          |                       | Total number of       |
 |                          |                       | iterations            |
 +--------------------------+-----------------------+-----------------------+
 | ``transformer-impl``     | transformer_engine or | Enable transformer    |
 |                          | local                 | engine by default     |
 +--------------------------+-----------------------+-----------------------+
 Benchmarking examples
 ^^^^^^^^^^^^^^^^^^^^^
 .. tab-set::
   .. tab-item:: Single node training
      :sync: single
      Use this command to run training with Llama 2 7B model on a single node. You can specify MBS, BS, FP,
      datatype, and so on.
      .. code-block:: bash
         TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
         SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
      You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
      See the sample output:
      .. image:: ../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
         :width: 800
   .. tab-item:: Multi node training
      :sync: multi
      Launch the Docker container on each node.
      In this example, run training with Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
      so on.
      On the master node:
      .. code-block:: bash
         TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
         SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
      On the worker node:
      .. code-block:: bash
         TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
         SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
      You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
      Sample output for 2-node training:
      Master node:
      .. image:: ../../data/how-to/rocm-for-ai/2-node-training-master.png
         :width: 800
      Worker node:
      .. image:: ../../data/how-to/rocm-for-ai/2-node-training-worker.png
         :width: 800
 Previous versions
 =================
 See :doc:`megatron-lm-history` to find documentation for previous releases
 of the ``ROCm/megatron-lm`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
@@ -0,0 +1,535 @@
 :orphan:
 .. meta::
   :description: How to train a model using Megatron-LM for ROCm.
   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
 ******************************************
 Training a model with Megatron-LM for ROCm
 ******************************************
 .. caution::
   This documentation does not reflect the latest version of ROCm Megatron-LM
   training performance documentation. See :doc:`../megatron-lm` for the latest version.
 The Megatron-LM framework for ROCm is a specialized fork of the robust Megatron-LM,
 designed to enable efficient training of large-scale language models on AMD
 GPUs. By leveraging AMD Instinct™ MI300X series accelerators, Megatron-LM delivers
 enhanced scalability, performance, and resource utilization for AI workloads.
 It is purpose-built to support models like Llama 2, Llama 3, Llama 3.1, and
 DeepSeek, enabling developers to train next-generation AI models more
 efficiently. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
 AMD provides a ready-to-use Docker image for MI300X accelerators containing
 essential components, including PyTorch, ROCm libraries, and Megatron-LM
 utilities. It contains the following software components to accelerate training
 workloads:
 +--------------------------+--------------------------------+
 | Software component       | Version                        |
 +==========================+================================+
 | ROCm                     | 6.3.0                          |
 +--------------------------+--------------------------------+
 | PyTorch                  | 2.7.0a0+git637433              |
 +--------------------------+--------------------------------+
 | Python                   | 3.10                           |
 +--------------------------+--------------------------------+
 | Transformer Engine       | 1.11                           |
 +--------------------------+--------------------------------+
 | Flash Attention          | 3.0.0                          |
 +--------------------------+--------------------------------+
 | hipBLASLt                | git258a2162                    |
 +--------------------------+--------------------------------+
 | Triton                   | 3.1                            |
 +--------------------------+--------------------------------+
 Supported features and models
 =============================
 Megatron-LM provides the following key features to train large language models efficiently:
 - Transformer Engine (TE)
 - APEX
 - GEMM tuning
 - Torch.compile
 - 3D parallelism: TP + SP + CP
 - Distributed optimizer
 - Flash Attention (FA) 3
 - Fused kernels
 - Pre-training
 .. _amd-megatron-lm-model-support:
 The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
 * Llama 2 7B
 * Llama 2 70B
 * Llama 3 8B
 * Llama 3 70B
 * Llama 3.1 8B
 * Llama 3.1 70B
 * DeepSeek-V2-Lite
 .. note::
   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).
 System validation
 =================
 If you have already validated your system settings, skip this step. Otherwise,
 complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
 to set up your system before starting training.
 Disable NUMA auto-balancing
 ---------------------------
 Generally, application performance can benefit from disabling NUMA auto-balancing. However,
 it might be detrimental to performance with certain types of workloads.
 Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
 Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
 the output is ``1``, run the following command to disable NUMA auto-balancing.
 .. code-block:: shell
   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
 See :ref:`mi300x-disable-numa` for more information.
 .. _mi300x-amd-megatron-lm-training:
 Environment setup
 =================
 The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
 training benchmarks, and achieve superior performance for models like Llama 3.1, Llama 2, and DeepSeek V2.
 Use the following instructions to set up the environment, configure the script to train models, and
 reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
 image.
 .. _amd-megatron-lm-requirements:
 Download the Docker image
 -------------------------
 1. Use the following command to pull the Docker image from Docker Hub.
   .. code-block:: shell
      docker pull rocm/megatron-lm:v25.3
 2. Launch the Docker container.
   .. code-block:: shell
      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.3
 3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
   .. code-block:: shell
      docker start megatron_training_env
      docker exec -it megatron_training_env bash
 The Docker container includes a pre-installed, verified version of Megatron-LM from the `release branch <https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3>`_.
 .. _amd-megatron-lm-environment-setup:
 Configuration scripts
 ---------------------
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      If you're working with Llama 2 7B or Llama 2 70B, use the ``train_llama2.sh`` configuration
      script in the ``examples/llama`` directory of
      `<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/llama>`__.
      Likewise, if you're working with Llama 3 or Llama 3.1, then use ``train_llama3.sh`` and update
      the configuration script accordingly.
   .. tab-item:: DeepSeek V2
      :sync: deepseek
      Use the ``train_deepseek_v2.sh`` configuration script in the ``examples/deepseek_v2``
      directory of
      `<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/deepseek_v2>`__
      and update the configuration script accordingly.
 Network interface
 ^^^^^^^^^^^^^^^^^
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      To avoid connectivity issues in multi-node deployments, ensure the correct network interface
      is set in your training scripts.
      1. Run the following command (outside the container) to find the active network interface on your system.
         .. code-block:: shell
            ip a
      2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system’s network interface. For
         example:
         .. code-block:: shell
            export NCCL_SOCKET_IFNAME=ens50f0np0
            export GLOO_SOCKET_IFNAME=ens50f0np0
 Dataset options
 ^^^^^^^^^^^^^^^
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      You can use either mock data or real data for training.
      * Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
        value is ``1`` for enabled.
        .. code-block:: bash
           MOCK_DATA=1
      * If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
        .. code-block:: bash
           MOCK_DATA=0
           DATA_PATH=${DATA_PATH:-"/data/bookcorpus_text_sentence"}  # Change to where your dataset is stored
        Ensure that the files are accessible inside the Docker container.
   .. tab-item:: DeepSeek V2
      :sync: deepseek
      If you don't already have the dataset, download the DeepSeek dataset using the following
      commands:
      .. code-block:: shell
         mkdir deepseek-datasets
         cd deepseek-datasets
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
      You can use either mock data or real data for training.
      * Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
        value is ``1`` for enabled.
        .. code-block:: bash
           MOCK_DATA=1
      * If you're using a real dataset, update the ``DATA_DIR`` variable to point to the location of your dataset.
        .. code-block:: bash
           MOCK_DATA=0
           DATA_DIR="/root/data/deepseek-datasets"  # Change to where your dataset is stored
        Ensure that the files are accessible inside the Docker container.
 Tokenizer
 ^^^^^^^^^
 Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
 models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
 a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
 fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
 handle a variety of input sequences, including unseen words or domain-specific terms.
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
      To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
      Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
      For example, if you're using the Llama 3.1 8B model:
      .. code-block:: shell
         TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
   .. tab-item:: DeepSeek V2
      :sync: deepseek
      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
 Multi-node training
 ^^^^^^^^^^^^^^^^^^^
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      If you're running multi-node training, update the following environment variables. They can
      also be passed as command line arguments.
      * Change ``localhost`` to the master node's hostname:
        .. code-block:: shell
           MASTER_ADDR="${MASTER_ADDR:-localhost}"
      * Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
        .. code-block:: shell
           NNODES="${NNODES:-1}"
      * Set the rank of each node (0 for master, 1 for the first worker node, and so on):
        .. code-block:: shell
           NODE_RANK="${NODE_RANK:-0}"
      * Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
        NFS directory) for multi-node runs:
        .. code-block:: shell
           DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs
      * For multi-node runs, make sure the correct network drivers are installed on the nodes. If
        you're running inside a Docker container, either install the drivers inside the container
        or pass the network drivers from the host when creating the container.
 Start training on AMD Instinct accelerators
 ===========================================
 The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
 system performance, conduct training benchmarks, and achieve superior
 performance for models like Llama 3.1 and Llama 2. This container should not be
 expected to provide generalized performance across all training workloads. You
 can expect the container to perform in the model configurations described in
 the following section, but other configurations are not validated by AMD.
 Use the following instructions to set up the environment, configure the script
 to train models, and reproduce the benchmark results on MI300X series
 accelerators with the AMD Megatron-LM Docker image.
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      .. tab-set::
         .. tab-item:: Single node training
            :sync: single-node
            To run training on a single node, navigate to the Megatron-LM folder and use the
            following command:
            .. code-block:: shell
               TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 bash examples/llama/train_llama3.sh
         .. tab-item:: Multi-node training
            :sync: multi-node
            To run training on multiple nodes, launch the Docker container on each node. For example, for a two node setup (``NODE0`` as the master node), use these commands.
            * On the master node ``NODE0``:
              .. code-block:: shell
                 TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh
            * On the worker node ``NODE1``:
              .. code-block:: shell
                 TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh
   .. tab-item:: DeepSeek V2
      :sync: deepseek
      To run the training on a single node, go to the ``/workspace/Megatron-LM`` folder and use the following command:
      .. code-block:: shell
         cd /workspace/Megatron-LM
         GEMM_TUNING=1 PR=bf16 MBS=4 AC=none bash examples/deepseek_v2/train_deepseekv2.sh
 Key options
 -----------
 .. _amd-megatron-lm-benchmark-test-vars:
 The benchmark tests support the following sets of variables:
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      ``TEE_OUTPUT``
        ``1`` to enable training logs or ``0`` to disable.
      ``TE_FP8``
        ``0`` for BF16 (default) or ``1`` for FP8 GEMMs.
      ``GEMM_TUNING``
        ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
      ``USE_FLASH_ATTN``
        ``1`` to enable Flash Attention.
      ``ENABLE_PROFILING``
        ``1`` to enable PyTorch profiling for performance analysis.
      ``transformer-impl``
        ``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.
      ``MODEL_SIZE``
        ``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2.
      ``TOTAL_ITERS``
        The total number of iterations -- ``10`` by default.
      ``MOCK_DATA``
        ``1`` to use mock data or ``0`` to use real data provided by you.
      ``MBS``
        Micro batch size.
      ``BS``
        Global batch size.
      ``TP``
        Tensor parallel (``1``, ``2``, ``4``, ``8``).
      ``SEQ_LENGTH``
        Input sequence length.
   .. tab-item:: DeepSeek V2
      :sync: deepseek
      ``PR``
        Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.
      ``GEMM_TUNING``
        ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
      ``TOTAL_ITERS``
        The total number of iterations -- ``10`` by default.
      ``MOCK_DATA``
        ``1`` to use mock data or ``0`` to use real data provided by you.
      ``MBS``
        Micro batch size.
      ``GBS``
        Global batch size.
 Benchmarking examples
 ---------------------
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      .. tab-set::
         .. tab-item:: Single node training
            :sync: single-node
            Use this command to run training with Llama 2 7B model on a single node. You can specify MBS, BS, FP,
            datatype, and so on.
            .. code-block:: bash
               TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
            You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
            See the sample output:
            .. image:: ../../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
               :width: 800
         .. tab-item:: Multi-node training
            :sync: multi-node
            Launch the Docker container on each node.
            In this example, run training with Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
            so on.
            On the master node:
            .. code-block:: bash
               TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
            On the worker node:
            .. code-block:: bash
               TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
            You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
            Sample output for 2-node training:
            Master node:
            .. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-master.png
               :width: 800
            Worker node:
            .. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-worker.png
               :width: 800
 Previous versions
 =================
 See :doc:`megatron-lm-history` to find documentation for previous releases
 of the ``ROCm/megatron-lm`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
@@ -0,0 +1,618 @@
 :orphan:
 .. meta::
   :description: How to train a model using Megatron-LM for ROCm.
   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
 ******************************************
 Training a model with Megatron-LM for ROCm
 ******************************************
 .. caution::
   This documentation does not reflect the latest version of ROCm Megatron-LM
   training performance documentation. See :doc:`../megatron-lm` for the latest version.
 The Megatron-LM framework for ROCm is a specialized fork of the robust Megatron-LM,
 designed to enable efficient training of large-scale language models on AMD
 GPUs. By leveraging AMD Instinct™ MI300X series accelerators, Megatron-LM delivers
 enhanced scalability, performance, and resource utilization for AI workloads.
 It is purpose-built to support models like Llama 2, Llama 3, Llama 3.1, and
 DeepSeek, enabling developers to train next-generation AI models more
 efficiently. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
 AMD provides a ready-to-use Docker image for MI300X series accelerators containing
 essential components, including PyTorch, ROCm libraries, and Megatron-LM
 utilities. It contains the following software components to accelerate training
 workloads:
 +--------------------------+--------------------------------+
 | Software component       | Version                        |
 +==========================+================================+
 | ROCm                     | 6.3.0                          |
 +--------------------------+--------------------------------+
 | PyTorch                  | 2.7.0a0+git637433              |
 +--------------------------+--------------------------------+
 | Python                   | 3.10                           |
 +--------------------------+--------------------------------+
 | Transformer Engine       | 1.11                           |
 +--------------------------+--------------------------------+
 | Flash Attention          | 3.0.0                          |
 +--------------------------+--------------------------------+
 | hipBLASLt                | git258a2162                    |
 +--------------------------+--------------------------------+
 | Triton                   | 3.1                            |
 +--------------------------+--------------------------------+
 Supported features and models
 =============================
 Megatron-LM provides the following key features to train large language models efficiently:
 - Transformer Engine (TE)
 - APEX
 - GEMM tuning
 - Torch.compile
 - 3D parallelism: TP + SP + CP
 - Distributed optimizer
 - Flash Attention (FA) 3
 - Fused kernels
 - Pre-training
 .. _amd-megatron-lm-model-support:
 The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
 * Llama 3.1 8B
 * Llama 3.1 70B
 * Llama 3 8B
 * Llama 3 70B
 * Llama 2 7B
 * Llama 2 70B
 * DeepSeek-V2-Lite
 .. note::
   Some models, such as Llama, require an external license agreement through
   a third party (for example, Meta).
 .. _amd-megatron-lm-performance-measurements:
 Performance measurements
 ========================
 To evaluate performance, the
 `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
 page provides reference throughput and latency measurements for training
 popular AI models.
 .. important::
   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
   only reflects the :doc:`latest version of this training benchmarking environment <../megatron-lm>`.
   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
 System validation
 =================
 If you have already validated your system settings, including NUMA
 auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
 and optimization steps <train-a-model-system-validation>` to set up your system
 before starting training.
 .. _mi300x-amd-megatron-lm-training:
 Environment setup
 =================
 The prebuilt ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
 training benchmarks, and achieve superior performance for models like Llama 3.1, Llama 2, and DeepSeek V2.
 Use the following instructions to set up the environment, configure the script to train models, and
 reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker
 image.
 .. _amd-megatron-lm-requirements:
 Download the Docker image
 -------------------------
 1. Use the following command to pull the Docker image from Docker Hub.
   .. code-block:: shell
      docker pull rocm/megatron-lm:v25.4
 2. Launch the Docker container.
   .. code-block:: shell
      docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.4
 3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.
   .. code-block:: shell
      docker start megatron_training_env
      docker exec -it megatron_training_env bash
 The Docker container includes a pre-installed, verified version of the ROCm Megatron-LM development branch `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__
 (commit `fd6f0d <https://github.com/ROCm/Megatron-LM/tree/fd6f0d11d7f9480ace32f22eb7e4dab5314fa350>`_).
 .. _amd-megatron-lm-environment-setup:
 Configuration scripts
 ---------------------
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      If you're working with Llama 2 7B or Llama 2 70B, use the ``train_llama2.sh`` configuration
      script in the ``examples/llama`` directory of
      `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__.
      Likewise, if you're working with Llama 3 or Llama 3.1, use ``train_llama3.sh`` and update
      the configuration script accordingly.
   .. tab-item:: DeepSeek V2
      :sync: deepseek
      Use the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
      directory of
      `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__
      and update the configuration script accordingly.
 Network interface
 ^^^^^^^^^^^^^^^^^
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      Update the network interface in the script to match your system's network interface. To
      find your network interface, run the following (outside of any Docker container):
      .. code-block:: bash
         ip a
      Look for an active interface that has an IP address in the same subnet as
      your other nodes. Then, update the following variables in the script, for
      example:
      .. code-block:: bash
         export NCCL_SOCKET_IFNAME=ens50f0np0
         export GLOO_SOCKET_IFNAME=ens50f0np0
 Dataset options
 ^^^^^^^^^^^^^^^
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      You can use either mock data or real data for training.
      * Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
        value is ``1`` for enabled.
        .. code-block:: bash
           MOCK_DATA=1
      * If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.
        .. code-block:: bash
           MOCK_DATA=0
           DATA_PATH="/data/bookcorpus_text_sentence"  # Change to where your dataset is stored
        Ensure that the files are accessible inside the Docker container.
        To download the dataset, set the ``DATASET`` variable to the dataset you'd like to use. Two datasets are supported: ``DATASET=wiki`` and ``DATASET=bookcorpus``.
        Use the following command to download the dataset.
        .. code-block:: shell
           DATASET=wiki bash examples/llama/prepare_dataset.sh # For wiki-en dataset
           DATASET=bookcorpus bash examples/llama/prepare_dataset.sh # For bookcorpus dataset
   .. tab-item:: DeepSeek V2
      :sync: deepseek
      If you don't already have the dataset, download the DeepSeek dataset using the following
      commands:
      .. code-block:: shell
         mkdir deepseek-datasets
         cd deepseek-datasets
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx
      You can use either mock data or real data for training.
      * Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
        value is ``1`` for enabled.
        .. code-block:: bash
           MOCK_DATA=1
      * If you're using a real dataset, update the ``DATA_DIR`` variable to point to the location of your dataset.
        .. code-block:: bash
           MOCK_DATA=0
           DATA_DIR="/root/data/deepseek-datasets"  # Change to where your dataset is stored
        Ensure that the files are accessible inside the Docker container.
 Tokenizer
 ^^^^^^^^^
 Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
 models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
 a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
 fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
 handle a variety of input sequences, including unseen words or domain-specific terms.
 You can assign the path of an existing tokenizer to the ``TOKENIZER_MODEL`` as shown in the following examples.
 If the tokenizer is not found, it'll be downloaded to the default tokenizer model path: ``${DATA_DIR}/tokenizer_llama3``
 or ``${DATA_DIR}/tokenizer_llama2``.
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
      or the default ``HuggingFaceTokenizer``.
      To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
      Set the Hugging Face model path in the ``TOKENIZER_MODEL`` variable.
      For example, if you're using the Llama 3.1 8B model:
      .. code-block:: shell
         TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
      .. note::
         If you don't already have the Llama 3.1 tokenizer locally, set your
         personal Hugging Face access token ``HF_TOKEN`` to download the
         tokenizer. If you encounter the following error, set ``HF_TOKEN`` to
         your access-authorized Hugging Face token.
         .. code-block:: shell
            OSError: You are trying to access a gated repo.
            # pass your HF_TOKEN
            export HF_TOKEN=$your_personal_hf_token
   .. tab-item:: DeepSeek V2
      :sync: deepseek
      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
 Multi-node training
 ^^^^^^^^^^^^^^^^^^^
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      If you're running multi-node training, update the following environment variables. They can
      also be passed as command line arguments.
      * Change ``localhost`` to the master node's hostname:
        .. code-block:: shell
           MASTER_ADDR="${MASTER_ADDR:-localhost}"
      * Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):
        .. code-block:: shell
           NNODES="${NNODES:-1}"
      * Set the rank of each node (0 for master, 1 for the first worker node, and so on):
        .. code-block:: shell
           NODE_RANK="${NODE_RANK:-0}"
      * Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
        NFS directory) for multi-node runs:
        .. code-block:: shell
           DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs
      * For multi-node runs, make sure the correct network drivers are installed on the nodes. If
        inside a Docker container, either install the drivers inside the Docker container or pass the network
        drivers from the host while creating the Docker container.
        .. code-block:: shell
           # Specify which RDMA interfaces to use for communication
           export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
 Start training on AMD Instinct accelerators
 ===========================================
 The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
 system performance, conduct training benchmarks, and achieve superior
 performance for models like Llama 3.1 and Llama 2. This container should not be
 expected to provide generalized performance across all training workloads. You
 can expect the container to perform in the model configurations described in
 the following section, but other configurations are not validated by AMD.
 Use the following instructions to set up the environment, configure the script
 to train models, and reproduce the benchmark results on MI300X series
 accelerators with the AMD Megatron-LM Docker image.
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      .. tab-set::
         .. tab-item:: Single node training
            :sync: single-node
            To run training on a single node, navigate to the Megatron-LM folder and use one of the
            following commands.
            - For Llama 3.1 8B FP8:
              .. code-block:: shell
                 TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
            - For Llama 3.1 8B BF16:
              .. code-block:: shell
                 TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=0 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
            - For Llama 2 7B FP8:
              .. code-block:: shell
                 TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
            - For Llama 2 7B BF16:
              .. code-block:: shell
                 TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=0 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
            To run training with FSDP2 enabled, add the ``FSDP=1`` argument. For example:
            - For Llama 3 70B BF16:
              .. code-block:: shell
                 TEE_OUTPUT=1 MBS=3 BS=24 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=8192 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh
            - For Llama 2 70B BF16:
              .. code-block:: shell
                 TEE_OUTPUT=1 MBS=3 BS=56 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=4096 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh
            .. note::
               It's suggested to use ``TP=1`` when FSDP is enabled for higher throughput. FSDP2 is not supported with pipeline parallelism,
               expert parallelism, MCore's distributed optimizer, gradient accumulation fusion, and ``FP16`` precision.
         .. tab-item:: Multi-node training
            :sync: multi-node
            To run training on multiple nodes, launch the Docker container on each node. For example, for a two-node setup (``NODE0`` as the master node), use these commands.
            * On the master node ``NODE0``:
              .. code-block:: shell
                 TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh
            * On the worker node ``NODE1``:
              .. code-block:: shell
                 TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh
   .. tab-item:: DeepSeek V2
      :sync: deepseek
      To run the training on a single node, go to the ``/workspace/Megatron-LM`` folder and use the following command:
      .. code-block:: shell
         cd /workspace/Megatron-LM
         GEMM_TUNING=1 PR=bf16 MBS=4 AC=none SEQ_LEN=4096 PAD_LEN=4096 TRAIN_ITERS=50 bash examples/deepseek_v2/train_deepseekv2.sh
 Key options
 -----------
 .. _amd-megatron-lm-benchmark-test-vars:
 The benchmark tests support the following sets of variables:
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      ``TEE_OUTPUT``
        ``1`` to enable training logs or ``0`` to disable.
      ``TE_FP8``
        ``0`` for BF16 or ``1`` for FP8 -- ``0`` by default.
      ``GEMM_TUNING``
        ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
      ``USE_FLASH_ATTN``
        ``1`` to enable Flash Attention.
      ``FSDP``
        ``1`` to enable PyTorch FSDP2. If FSDP is enabled, ``--use-distributed-optimizer``,
        ``--overlap-param-gather``, and ``--sequence-parallel`` are automatically disabled.
      ``ENABLE_PROFILING``
        ``1`` to enable PyTorch profiling for performance analysis.
      ``transformer-impl``
        ``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.
      ``MODEL_SIZE``
        ``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2.
      ``TOTAL_ITERS``
        The total number of iterations -- ``10`` by default.
      ``MOCK_DATA``
        ``1`` to use mock data or ``0`` to use real data you provide.
      ``MBS``
        Micro batch size.
      ``BS``
        Global batch size.
      ``TP``
        Tensor parallel (``1``, ``2``, ``4``, ``8``). ``TP`` is disabled when ``FSDP`` is turned on.
      ``SEQ_LENGTH``
        Input sequence length.
   .. tab-item:: DeepSeek V2
      :sync: deepseek
      ``PR``
        Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.
      ``GEMM_TUNING``
        ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.
      ``TRAIN_ITERS``
        The total number of iterations.
      ``MOCK_DATA``
        ``1`` to use mock data or ``0`` to use real data you provide.
      ``MBS``
        Micro batch size.
      ``GBS``
        Global batch size.
      ``SEQ_LEN``
        Input sequence length.
      ``AC``
        Activation checkpointing (``none``, ``sel``, or ``full``) -- ``sel`` by default.
 Benchmarking examples
 ---------------------
 .. tab-set::
   .. tab-item:: Llama
      :sync: llama
      .. tab-set::
         .. tab-item:: Single node training
            :sync: single-node
            Use this command to run training with Llama 2 7B model on a single node. You can specify MBS, BS, FP,
            datatype, and so on.
            .. code-block:: bash
               TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
            You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
            See the sample output:
            .. image:: ../../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
               :width: 800
         .. tab-item:: Multi-node training
            :sync: multi-node
            Launch the Docker container on each node.
            In this example, run training with Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
            so on.
            On the master node:
            .. code-block:: bash
               TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
            On the worker node:
            .. code-block:: bash
               TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh
            You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.
            Sample output for 2-node training:
            Master node:
            .. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-master.png
               :width: 800
            Worker node:
            .. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-worker.png
               :width: 800
 Previous versions
 =================
 See :doc:`megatron-lm-history` to find documentation for previous releases
 of the ``ROCm/megatron-lm`` Docker image.
--- a/Show More
+++ b/Show More