Mirror of https://github.com/ROCm/ROCm.git (synced 2026-01-11 07:38:17 -05:00)

Compare commits: docs/7.0.0 ... pytorch_6. (58 commits)
| SHA1 |
|---|
| 9f6afc9400 |
| 6286090d12 |
| 53f30c7880 |
| 8e0e0b93c6 |
| 629b9184b4 |
| b3e8ac32e7 |
| 419b3a02a2 |
| 304809951f |
| c9f1c821eb |
| 876e11fc8d |
| 1c2513b788 |
| 7d26eb0e6f |
| a62f4a5296 |
| 404e91f2d9 |
| 50cfc538ff |
| a9c323e596 |
| 7a81d10c1d |
| 43736ef655 |
| d4416e2162 |
| 00f74d2d8e |
| db9e845844 |
| 4963eeab00 |
| 7c25ce240b |
| bac2d038f7 |
| fdeaacd3cc |
| 8e61ba4f90 |
| 4051e985d4 |
| 94ee445a8a |
| 535859ac9f |
| 2e5fe544a0 |
| 4dae0ba84d |
| 5ddab465c3 |
| 151e563dcb |
| 2098af1456 |
| ae1a330fd7 |
| cab805674a |
| 387cfab91f |
| 525703a5ab |
| ce65e6783b |
| 6d2b1595b3 |
| 31e9013bdc |
| 698ac70662 |
| 9b69755b99 |
| 05773ca41e |
| 4f80043312 |
| 223fbb8f28 |
| 845b3c4d5a |
| 11747aaadc |
| 8e7d43bec2 |
| 1088beefe5 |
| b7988925a5 |
| 89dafa6232 |
| 080b15d261 |
| 8054852dad |
| 542d7813ce |
| bc1ffe4fcb |
| 09997c68bb |
| 42bc3501ac |
@@ -1,33 +0,0 @@
variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo

parameters:
- name: pipelinesRepoRef
  type: string
  default: refs/heads/develop
- name: librariesRepoRef
  type: string
  default: refs/heads/develop

resources:
  repositories:
  - repository: pipelines_repo
    type: github
    endpoint: ROCm
    name: ROCm/ROCm
    ref: ${{ parameters.pipelinesRepoRef }}
  - repository: libraries_repo
    type: github
    endpoint: ROCm
    name: ROCm/rocm-libraries
    ref: ${{ parameters.librariesRepoRef }}

trigger: none
pr: none

jobs:
- template: /.azuredevops/ci-builds/mathlibs.yml@pipelines_repo
  parameters:
    checkoutRepo: libraries_repo
    buildDependsOn: false
@@ -1,38 +0,0 @@
# entrypoint for kicking off a unified build of the mathlibs
# this template is designed to be called by another pipeline (llvm, clr, etc.)
# `buildDependsOn` will need to be set when calling this template
# passes a `unifiedBuild` param to downstream pipelines, which will prevent duplicate jobs
# logic needs to be added in individual mathlib pipelines for handling `unifiedBuild`

parameters:
- name: checkoutRepo
  type: string
  default: monorepo
- name: buildDependsOn
  type: object
  default: false
- name: downstreamComponentMatrix
  type: object
  default:
    - rocRAND:
        name: rocRAND
        sparseCheckoutDir: projects/rocrand
    - rocPRIM:
        name: rocPRIM
        sparseCheckoutDir: projects/rocprim
    - hipBLAS-common:
        name: hipBLAS-common
        sparseCheckoutDir: projects/hipblas-common
    # - composable_kernel:
    #     name: composable_kernel
    #     sparseCheckoutDir: projects/composablekernel

jobs:
- ${{ each component in parameters.downstreamComponentMatrix }}:
  - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
      sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
      buildDependsOn: ${{ parameters.buildDependsOn }}
      triggerDownstreamJobs: true
      unifiedBuild: true
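Since `buildDependsOn` must be supplied by the caller, a pipeline such as llvm or clr would include this template roughly as follows. This is a minimal sketch, not a pipeline from the repo: it assumes `pipelines_repo` and `libraries_repo` resources like the ones in the trigger pipeline above, and the `llvm_build` job name and the shape of `buildDependsOn` are illustrative only.

# Hypothetical caller sketch: run the unified mathlibs build after an llvm job.
jobs:
- job: llvm_build          # illustrative upstream job, not from the repo
  steps:
  - checkout: self
- template: /.azuredevops/ci-builds/mathlibs.yml@pipelines_repo
  parameters:
    checkoutRepo: libraries_repo
    buildDependsOn:        # assumed shape: job name(s) the component jobs wait on
      - llvm_build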
@@ -20,7 +20,7 @@ parameters:
    - ocl-icd-libopencl1
    - ocl-icd-opencl-dev
    - opencl-headers
    - zlib1g-dev
    - python3-pip
- name: pipModules
  type: object
  default:
@@ -41,148 +41,120 @@ parameters:
# any changes for clr should just trigger HIP pipeline
# similarly for hipother repo, for Nvidia backend

- name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }

# HIP with AMD backend
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: hip_clr_combined_${{ job.os }}_amd
    pool:
      vmImage: 'ubuntu-22.04'
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
    # checkout triggering repo (either HIP or clr)
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
    # if this is triggered by HIP repo, matching repo is clr
    # if this is triggered by clr repo, matching repo is HIP
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: matching_repo
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: hipother_repo
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependenciesAMD }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
    # compile clr
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        componentName: clr
        cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
        cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
          -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
          -DHIP_PLATFORM=amd
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
          -DCLR_BUILD_HIP=ON
          -DCLR_BUILD_OCL=ON
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        artifactName: amd
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        artifactName: amd
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
    #     pipModules: ${{ parameters.pipModules }}
    #     environment: amd
- job: hip_clr_combined_amd
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
  pool:
    vmImage: ${{ variables.BASE_BUILD_POOL }}
  workspace:
    clean: all
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
      pipModules: ${{ parameters.pipModules }}
  # checkout triggering repo (either HIP or clr)
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
  # if this is triggered by HIP repo, matching repo is clr
  # if this is triggered by clr repo, matching repo is HIP
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: matching_repo
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: hipother_repo
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
    parameters:
      checkoutRef: ${{ parameters.checkoutRef }}
      dependencyList: ${{ parameters.rocmDependenciesAMD }}
      aggregatePipeline: ${{ parameters.aggregatePipeline }}
  # compile clr
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: clr
      cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
      cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
      extraBuildFlags: >-
        -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
        -DHIP_PLATFORM=amd
        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
        -DROCM_PATH=$(Agent.BuildDirectory)/rocm
        -DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
        -DCLR_BUILD_HIP=ON
        -DCLR_BUILD_OCL=ON
        -GNinja
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
    parameters:
      artifactName: amd
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
    parameters:
      artifactName: amd
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
  #   parameters:
  #     aptPackages: ${{ parameters.aptPackages }}
  #     pipModules: ${{ parameters.pipModules }}
  #     environment: amd

# HIP with Nvidia backend
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: hip_clr_combined_${{ job.os }}_nvidia
    pool:
      vmImage: 'ubuntu-22.04'
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    # checkout triggering repo (either HIP or clr)
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
    # if this is triggered by HIP repo, matching repo is clr
    # if this is triggered by clr repo, matching repo is HIP
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: matching_repo
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: hipother_repo
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependenciesNvidia }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
    - script: 'ls -1R $(Agent.BuildDirectory)/rocm'
      displayName: 'Artifact listing'
    # compile clr
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        componentName: clr
        cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
        cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
          -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
          -DHIP_PLATFORM=nvidia
          -DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
          -DCLR_BUILD_HIP=ON
          -DCLR_BUILD_OCL=OFF
          -DHIPNV_DIR=$(Build.SourcesDirectory)/hipother/hipnv
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        artifactName: nvidia
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
    #     pipModules: ${{ parameters.pipModules }}
    #     environment: nvidia
- job: hip_clr_combined_nvidia
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
  pool:
    vmImage: ${{ variables.BASE_BUILD_POOL }}
  workspace:
    clean: all
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
      pipModules: ${{ parameters.pipModules }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  # checkout triggering repo (either HIP or clr)
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
  # if this is triggered by HIP repo, matching repo is clr
  # if this is triggered by clr repo, matching repo is HIP
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: matching_repo
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: hipother_repo
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
    parameters:
      checkoutRef: ${{ parameters.checkoutRef }}
      dependencyList: ${{ parameters.rocmDependenciesNvidia }}
      aggregatePipeline: ${{ parameters.aggregatePipeline }}
  - script: 'ls -1R $(Agent.BuildDirectory)/rocm'
    displayName: 'Artifact listing'
  # compile clr
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: clr
      cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
      cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
      extraBuildFlags: >-
        -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
        -DHIP_PLATFORM=nvidia
        -DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
        -DCLR_BUILD_HIP=ON
        -DCLR_BUILD_OCL=OFF
        -DHIPNV_DIR=$(Build.SourcesDirectory)/hipother/hipnv
        -GNinja
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
    parameters:
      artifactName: nvidia
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
  #   parameters:
  #     aptPackages: ${{ parameters.aptPackages }}
  #     pipModules: ${{ parameters.pipModules }}
  #     environment: nvidia
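The `matching_repo` and `hipother_repo` checkout steps above rely on repository resources declared by whichever pipeline includes this template. A minimal sketch of what those declarations might look like, assuming GitHub repositories under the ROCm org and the `ROCm` service endpoint used elsewhere in these files; the `ref` values are illustrative.

# Illustrative only: the actual declarations live in the including pipeline.
# Shown for a run triggered by the HIP repo, so the "matching" repo is clr.
resources:
  repositories:
  - repository: matching_repo
    type: github
    endpoint: ROCm
    name: ROCm/clr           # would be ROCm/HIP for a clr-triggered run
    ref: refs/heads/develop  # assumed
  - repository: hipother_repo
    type: github
    endpoint: ROCm
    name: ROCm/hipother      # HIP implementation for the Nvidia backend
    ref: refs/heads/develop  # assumed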
@@ -1,7 +1,4 @@
parameters:
- name: componentName
  type: string
  default: HIPIFY
- name: checkoutRepo
  type: string
  default: 'self'
@@ -16,140 +13,113 @@ parameters:
- name: aptPackages
  type: object
  default:
    - cuda-toolkit-12-9
    - libcudnn9-dev-cuda-12
    - libnuma-dev
    - mesa-common-dev
    - cmake
    - ninja-build
    - python-is-python3
    - libnuma-dev
    - python3-dev
    - python3-pip
- name: pipModules
  type: object
  default:
    - lit
- name: rocmDependencies
  type: object
  default:
    - llvm-project

- name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    - python-is-python3
    - mesa-common-dev
    - ccache
    - cuda-toolkit
    - cudnn

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    pool:
      ${{ if eq(job.os, 'ubuntu2404') }}:
        name: rocm-ci_medium_build_pool_2404
      ${{ else }}:
        name: ${{ variables.MEDIUM_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - task: Bash@3
      displayName: 'Register CUDA packages'
      inputs:
        targetType: inline
        ${{ if eq(job.os, 'ubuntu2204') }}:
          script: |
            wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
            sudo dpkg -i cuda-keyring_1.1-1_all.deb
            sudo rm -f cuda-keyring_1.1-1_all.deb
            sudo apt update
        ${{ if eq(job.os, 'almalinux8') }}:
          script: |
            sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - task: Bash@3
      displayName: Add lit to PATH
      inputs:
        targetType: inline
        script: |
          site_packages=$(python3 -m site --user-base)/bin
          sudo ln -sf $site_packages/bin/lit $(Pipeline.Workspace)/llvm-lit
          echo "##vso[task.prependpath]$site_packages"
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
    # cutensor is not available from apt or dnf
    - task: Bash@3
      displayName: 'Download and install cutensor'
      inputs:
        targetType: inline
        script: |
          wget -q --show-progress https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor/linux-x86_64/libcutensor-linux-x86_64-2.2.0.0-archive.tar.xz
          tar -xvJf libcutensor-linux-x86_64-*.tar.xz
          mkdir -p $(Pipeline.Workspace)/cutensor
          cp -r libcutensor-linux-x86_64-*/* $(Pipeline.Workspace)/cutensor/
    - task: Bash@3
      displayName: 'List downloaded CUDA files'
      inputs:
        targetType: inline
        script: ls -la1R /usr/local/cuda-12.9
        # script: cp $(Pipeline.Workspace)/llvm-project/llvm/build/bin/FileCheck $(Pipeline.Workspace)/llvm/bin
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        consolidateBuildAndInstall: true
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;/usr/local/cuda/targets/x86_64-linux/lib
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
          -DHIPIFY_CLANG_TESTS=ON
          -DCMAKE_BUILD_TYPE=Release
          -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.9
          -DCUDA_DNN_ROOT_DIR=/usr/local/cuda-12.9
          -DCUDA_CUB_ROOT_DIR=/usr/local/cuda-12.9/targets/x86_64-linux/include/cub
          -DCUDA_TENSOR_ROOT_DIR=$(Pipeline.Workspace)/cutensor/
        multithreadFlag: -- -j32
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    #   parameters:
    #     componentName: HIPIFY
    #     testDir: $(Build.SourcesDirectory)/build
    #     testExecutable: make
    #     testParameters: -j 32 test-hipify
    #     testPublishResults: false
    #     os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - ${{ if eq(job.os, 'ubuntu2204') }}:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          environment: combined
          registerCUDAPackages: true
          extraCopyDirectories:
            - llvm-project
- job: HIPIFY
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
  - name: UPSTREAM_LLVM_GIT_URL
    value: https://github.com/llvm/llvm-project.git
  - name: UPSTREAM_LLVM_TAG
    value: llvmorg-18.1.2
  pool: ${{ variables.MEDIUM_BUILD_POOL }}
  workspace:
    clean: all
  steps:
  - task: Bash@3
    displayName: 'Register CUDA packages'
    inputs:
      targetType: inline
      script: |
        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
        sudo dpkg -i cuda-keyring_1.1-1_all.deb
        sudo rm -f cuda-keyring_1.1-1_all.deb
        sudo apt update
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - task: Bash@3
    displayName: git clone upstream llvm-project
    inputs:
      targetType: inline
      script: git clone $(UPSTREAM_LLVM_GIT_URL) --depth=1 --branch $(UPSTREAM_LLVM_TAG) --recurse-submodules
      workingDirectory: $(Pipeline.Workspace)
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
  - script: |
      mkdir -p $(CCACHE_DIR)
      echo "##vso[task.prependpath]/usr/lib/ccache:/usr/local/cuda/bin"
    displayName: Update path for cuda and ccache
  - task: Cache@2
    displayName: Ccache caching
    inputs:
      key: HIPIFY | $(Agent.OS) | "$(UPSTREAM_LLVM_TAG)"
      path: $(CCACHE_DIR)
      restoreKeys: HIPIFY | $(Agent.OS)
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: upstream-llvm
      cmakeBuildDir: $(Pipeline.Workspace)/llvm-project/llvm/build
      cmakeSourceDir: $(Pipeline.Workspace)/llvm-project/llvm
      installDir: $(Pipeline.Workspace)/llvm
      extraBuildFlags: >-
        -DCMAKE_BUILD_TYPE=Release
        -DLLVM_ENABLE_PROJECTS=clang
        -DLLVM_INCLUDE_TESTS=OFF
        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
        -DCMAKE_C_COMPILER_LAUNCHER=ccache
        -GNinja
  - task: Bash@3
    displayName: python install lit
    inputs:
      targetType: inline
      script: sudo python3 $(Pipeline.Workspace)/llvm-project/llvm/utils/lit/setup.py install
  - task: Bash@3
    displayName: install FileCheck
    inputs:
      targetType: inline
      script: cp $(Pipeline.Workspace)/llvm-project/llvm/build/bin/FileCheck $(Pipeline.Workspace)/llvm/bin
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: HIPIFY
      extraBuildFlags: >-
        -DHIPIFY_CLANG_TESTS=ON
        -DCMAKE_BUILD_TYPE=Release
        -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/targets/x86_64-linux
        -DCUDA_DNN_ROOT_DIR=/usr/local/cuda/targets/x86_64-linux
        -DCMAKE_PREFIX_PATH=$(Pipeline.Workspace)/llvm;/usr/local/cuda/targets/x86_64-linux/lib
        -DLLVM_EXTERNAL_LIT=$(Pipeline.Workspace)/llvm-project/llvm/build/bin/llvm-lit
      multithreadFlag: -- -j32
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: HIPIFY
      testDir: $(Build.SourcesDirectory)/build
      testExecutable: make
      testParameters: test-hipify
      testPublishResults: false
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
      environment: combined
      registerCUDAPackages: true
      extraCopyDirectories:
        - llvm-project
      extraEnvVars:
        - UPSTREAM_LLVM_GIT_URL:::https://github.com/llvm/llvm-project.git
        - UPSTREAM_LLVM_TAG:::llvmorg-18.1.2
@@ -16,7 +16,6 @@ parameters:
    - cmake
    - jq
    - libdrm-dev
    - libmsgpack-dev
    - libsqlite3-dev
    - libstdc++-12-dev
    - ninja-build
@@ -43,20 +43,18 @@ parameters:
- name: rocmDependencies
  type: object
  default:
    - AMDMIGraphX
    - clr
    - half
    - hipBLAS-common
    - hipBLASLt
    - llvm-project
    - MIOpen
    - rocBLAS
    - rocDecode
    - rocm-cmake
    - llvm-project
    - ROCR-Runtime
    - clr
    - rocminfo
    - rocprofiler-register
    - ROCR-Runtime
    - half
    - rocBLAS
    - MIOpen
    - AMDMIGraphX
    - rpp
    - rocDecode
- name: rocmTestDependencies
  type: object
  default:
@@ -92,7 +90,8 @@ jobs:
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
    pool:
      vmImage: ${{ variables.BASE_BUILD_POOL }}
    workspace:
      clean: all
    steps:
@@ -20,6 +20,7 @@ parameters:
    - libnuma-dev
    - ninja-build
    - pkg-config
    - python3-pip
- name: rocmDependencies
  type: object
  default:
@@ -35,65 +36,51 @@ parameters:
- name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    - gfx942:
        target: gfx942
    - gfx90a:
        target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ROCR_Runtime_build_${{ job.os }}
    pool:
      vmImage: 'ubuntu-22.04'
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DBUILD_SHARED_LIBS=ON
          -DCMAKE_BUILD_TYPE=Release
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
- job: ROCR_Runtime_build
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
  pool:
    vmImage: ${{ variables.BASE_BUILD_POOL }}
  workspace:
    clean: all
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
    parameters:
      checkoutRef: ${{ parameters.checkoutRef }}
      dependencyList: ${{ parameters.rocmDependencies }}
      aggregatePipeline: ${{ parameters.aggregatePipeline }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      extraBuildFlags: >-
        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
        -DBUILD_SHARED_LIBS=ON
        -DCMAKE_BUILD_TYPE=Release
        -GNinja
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
  #   parameters:
  #     aptPackages: ${{ parameters.aptPackages }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
  - job: ROCR_Runtime_test_${{ job.os }}_${{ job.target }}
    dependsOn: ROCR_Runtime_build_${{ job.os }}
- job: ROCR_Runtime_test_${{ job.target }}
  dependsOn: ROCR_Runtime_build
  condition:
    and(succeeded(),
    eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -110,7 +97,6 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
      packageManager: ${{ job.packageManager }}
  - task: Bash@3
    displayName: Install libhwloc5
    inputs:
@@ -121,15 +107,12 @@ jobs:
      sudo apt install -y --allow-downgrades ./libhwloc5_1.11.12-3_amd64.deb ./libhwloc-dev_1.11.12-3_amd64.deb
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
    parameters:
      os: ${{ job.os }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
    parameters:
      checkoutRef: ${{ parameters.checkoutRef }}
      dependencyList: ${{ parameters.rocmTestDependencies }}
      gpuTarget: ${{ job.target }}
      os: ${{ job.os }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -138,13 +121,11 @@ jobs:
      runRocminfo: false
  - task: Bash@3
    displayName: Build kfdtest
    continueOnError: true
    inputs:
      targetType: 'inline'
      workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest
      script: |
        if [ -e /opt/rh/gcc-toolset-14/enable ]; then
          source /opt/rh/gcc-toolset-14/enable
        fi
        mkdir build && cd build
        cmake -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm ..
        make
@@ -154,16 +135,13 @@ jobs:
      testExecutable: BIN_DIR=$(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/build ./run_kfdtest.sh
      testParameters: '-p core --gtest_output=xml:./test_output.xml --gtest_color=yes'
      testDir: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/scripts
      os: ${{ job.os }}
  - task: Bash@3
    displayName: Build rocrtst
    continueOnError: true
    inputs:
      targetType: 'inline'
      workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common
      script: |
        if [ -e /opt/rh/gcc-toolset-14/enable ]; then
          source /opt/rh/gcc-toolset-14/enable
        fi
        BASE_CLANG_DIR=$(Agent.BuildDirectory)/rocm/llvm/lib/clang
        export NEWEST_CLANG_VER=$(ls -1 $BASE_CLANG_DIR | sort -V | tail -n 1)
        mkdir build && cd build
@@ -181,7 +159,6 @@ jobs:
      testExecutable: ./rocrtst64
      testParameters: '--gtest_filter="-rocrtstNeg.Memory_Negative_Tests:rocrtstFunc.Memory_Max_Mem" --gtest_output=xml:./test_output.xml --gtest_color=yes'
      testDir: $(Build.SourcesDirectory)/rocrtst/suites/test_common/build/${{ job.target }}
      os: ${{ job.os }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
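The test jobs above are gated by `ENABLE_<TARGET>_TESTS` variables (for example `ENABLE_GFX942_TESTS`), which the `condition` expression reads at queue time. A minimal sketch of the shape those gates take, assuming they are defined in the `common` variable group in the Azure DevOps library; the values shown are illustrative, not the project's actual settings.

# Illustrative only: the real values live in the `common` variable group,
# not in the repo. This shows the shape the condition above expects.
variables:
- group: common              # assumed to define the gates below
- name: ENABLE_GFX942_TESTS
  value: 'true'              # hypothetical: enables *_test_*_gfx942 jobs
- name: ENABLE_GFX90A_TESTS
  value: 'false'             # hypothetical: skips *_test_*_gfx90a jobs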
@@ -15,6 +15,7 @@ parameters:
  default:
    - cmake
    - ninja-build
    - python3-pip
- name: rocmDependencies
  type: object
  default:
@@ -23,57 +24,37 @@ parameters:
    - rocminfo
    - ROCR-Runtime

- name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ROCdbgapi_build_${{ job.os }}
    pool:
      vmImage: 'ubuntu-22.04'
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
          -DCMAKE_BUILD_TYPE=Release
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
- job: ROCdbgapi
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
  pool:
    vmImage: ${{ variables.BASE_BUILD_POOL }}
  workspace:
    clean: all
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
    parameters:
      checkoutRef: ${{ parameters.checkoutRef }}
      dependencyList: ${{ parameters.rocmDependencies }}
      aggregatePipeline: ${{ parameters.aggregatePipeline }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      extraBuildFlags: >-
        -DCMAKE_BUILD_TYPE=Release
        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
        -GNinja
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
  #   parameters:
  #     aptPackages: ${{ parameters.aptPackages }}
@@ -1,7 +1,4 @@
parameters:
- name: componentName
  type: string
  default: ROCgdb
- name: checkoutRepo
  type: string
  default: 'self'
@@ -26,10 +23,8 @@ parameters:
    - libgmp-dev
    - liblzma-dev
    - libmpfr-dev
    - ncurses-dev
    - pkg-config
    - python3-dev
    - python3-pip
    - ncurses-dev
    - texinfo
    - zlib1g-dev
- name: rocmDependencies
@@ -45,87 +40,67 @@ parameters:
- name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    - gfx942:
        target: gfx942
    - gfx90a:
        target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    - name: PKG_CONFIG_PATH
      value: $(Agent.BuildDirectory)/rocm/share/pkgconfig
    pool:
      ${{ if eq(job.os, 'ubuntu2404') }}:
        name: rocm-ci_medium_build_pool_2404
      ${{ else }}:
        name: ${{ variables.MEDIUM_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-autotools.yml
      parameters:
        os: ${{ job.os }}
        configureFlags: >-
          --program-prefix=roc
          --enable-64-bit-bfd
          --enable-targets="x86_64-linux-gnu,amdgcn-amd-amdhsa"
          --disable-ld
          --disable-gas
          --disable-gdbserver
          --disable-sim
          --enable-tui
          --disable-gdbtk
          --disable-shared
          --disable-gprofng
          --with-expat
          --with-system-zlib
          --without-guile
          --with-babeltrace
          --with-lzma
          --with-python=python3
          --with-rocm-dbgapi=$(Agent.BuildDirectory)/rocm
          LDFLAGS="-Wl,--enable-new-dtags,-rpath=$(Agent.BuildDirectory)/rocm/lib"
        makeCallPrefix: LD_RUN_PATH='${ORIGIN}/../lib'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- job: ROCgdb
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
  - name: PKG_CONFIG_PATH
    value: $(Agent.BuildDirectory)/rocm/share/pkgconfig
  pool:
    vmImage: ${{ variables.BASE_BUILD_POOL }}
  workspace:
    clean: all
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
    parameters:
      checkoutRef: ${{ parameters.checkoutRef }}
      dependencyList: ${{ parameters.rocmDependencies }}
      aggregatePipeline: ${{ parameters.aggregatePipeline }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-autotools.yml
    parameters:
      configureFlags: >-
        --program-prefix=roc
        --enable-64-bit-bfd
        --enable-targets="x86_64-linux-gnu,amdgcn-amd-amdhsa"
        --disable-ld
        --disable-gas
        --disable-gdbserver
        --disable-sim
        --enable-tui
        --disable-gdbtk
        --disable-shared
        --disable-gprofng
        --with-expat
        --with-system-zlib
        --without-guile
        --with-babeltrace
        --with-lzma
        --with-python=python3
        --with-rocm-dbgapi=$(Agent.BuildDirectory)/rocm
        LDFLAGS="-Wl,--enable-new-dtags,-rpath=$(Agent.BuildDirectory)/rocm/lib"
      makeCallPrefix: LD_RUN_PATH='${ORIGIN}/../lib'
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml

- ${{ each job in parameters.jobMatrix.testJobs }}:
  - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
    dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
- job: ROCgdb_test_${{ job.target }}
  dependsOn: ROCgdb
  condition:
    and(succeeded(),
    eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -144,23 +119,18 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
      packageManager: ${{ job.packageManager }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    parameters:
      os: ${{ job.os }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
    parameters:
      checkoutRef: ${{ parameters.checkoutRef }}
      dependencyList: ${{ parameters.rocmDependencies }}
      os: ${{ job.os }}
      aggregatePipeline: ${{ parameters.aggregatePipeline }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-autotools.yml
    parameters:
      os: ${{ job.os }}
      configureFlags: >-
        --program-prefix=roc
        --enable-64-bit-bfd
@@ -196,9 +166,7 @@ jobs:
    continueOnError: true
    inputs:
      targetType: inline
      script: |
        ${{ iif(eq(job.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
        make check-gdb TESTS=gdb.rocm/simple.exp
      script: make check-gdb TESTS=gdb.rocm/simple.exp
      workingDirectory: $(Build.SourcesDirectory)
  - task: Bash@3
    displayName: print gdb log
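The `--enable-new-dtags,-rpath` LDFLAGS and the `LD_RUN_PATH` make prefix above bake a runtime library search path into the roc-gdb binary, so it can locate librocm-dbgapi without LD_LIBRARY_PATH. A hypothetical verification step, not part of the pipeline, could inspect the dynamic section of the freshly built binary; the `gdb/gdb` path is an assumption about where the autotools build leaves it.

# Hypothetical debugging step, not present in the pipeline: confirm the
# RUNPATH recorded in the freshly built binary (path assumed).
- task: Bash@3
  displayName: 'Inspect RUNPATH of roc-gdb (illustrative)'
  inputs:
    targetType: inline
    script: readelf -d $(Build.SourcesDirectory)/gdb/gdb | grep -E 'RPATH|RUNPATH'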
@@ -1,29 +1,10 @@
parameters:
- name: componentName
  type: string
  default: Tensile
- name: checkoutRepo
  type: string
  default: 'self'
- name: checkoutRef
  type: string
  default: ''
# monorepo related parameters
- name: sparseCheckoutDir
  type: string
  default: ''
- name: triggerDownstreamJobs
  type: boolean
  default: false
- name: downstreamAggregateNames
  type: string
  default: ''
- name: buildDependsOn
  type: object
  default: null
- name: unifiedBuild
  type: boolean
  default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -32,6 +13,7 @@ parameters:
- name: aptPackages
  type: object
  default:
    - python3-pip
    - cmake
    - libmsgpack-dev
    - libboost-program-options-dev
@@ -56,97 +38,75 @@ parameters:
- name: jobMatrix
  type: object
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    - gfx942:
        target: gfx942
    - gfx90a:
        target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn: ${{ parameters.buildDependsOn[job.target] }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    - name: ROCM_PATH
      value: $(Agent.BuildDirectory)/rocm
    pool:
      vmImage: ${{ variables.BASE_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - task: Bash@3
      displayName: Create wheel file
      inputs:
        targetType: inline
        script: python3 setup.py bdist_wheel
        workingDirectory: $(Agent.BuildDirectory)/s
    - task: Bash@3
      displayName: Rename wheel file with job OS
      inputs:
        targetType: inline
        workingDirectory: $(Agent.BuildDirectory)/s
        script: |
          wheelFile=$(find "$(Agent.BuildDirectory)/s/dist" -type f -name "*.whl" | head -n 1)
          newWheelFile="$(basename "$wheelFile" .whl)-${{ job.os }}.whl"
          mv "$wheelFile" "$(dirname "$wheelFile")/$newWheelFile"
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
      parameters:
        sourceDir: $(Agent.BuildDirectory)/s/dist
        contentsString: '*.whl'
        targetDir: $(Build.ArtifactStagingDirectory)
        clean: false
    - task: PublishPipelineArtifact@1
      displayName: 'wheel file Publish'
      retryCountOnTaskFailure: 3
      inputs:
        targetPath: $(Build.ArtifactStagingDirectory)
    - task: Bash@3
      displayName: Save pipeline artifact file names
      inputs:
        workingDirectory: $(Pipeline.Workspace)
        targetType: inline
        script: |
          whlFile=$(find "$(Build.ArtifactStagingDirectory)" -type f -name "*.whl" | head -n 1)
          if [ -n "$whlFile" ]; then
            echo $(basename "$whlFile") >> pipelineArtifacts.txt
          fi
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
    #     pipModules: ${{ parameters.pipModules }}
- job: Tensile_build
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
  - name: ROCM_PATH
    value: $(Agent.BuildDirectory)/rocm
  pool:
    vmImage: ${{ variables.BASE_BUILD_POOL }}
  workspace:
    clean: all
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
      pipModules: ${{ parameters.pipModules }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
    parameters:
      checkoutRef: ${{ parameters.checkoutRef }}
      dependencyList: ${{ parameters.rocmDependencies }}
      aggregatePipeline: ${{ parameters.aggregatePipeline }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
  - task: Bash@3
    displayName: Create wheel file
    inputs:
      targetType: inline
      script: python3 setup.py bdist_wheel
      workingDirectory: $(Build.SourcesDirectory)
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
    parameters:
      sourceDir: $(Build.SourcesDirectory)/dist
      contentsString: '*.whl'
      targetDir: $(Build.ArtifactStagingDirectory)
      clean: false
  - task: PublishPipelineArtifact@1
    displayName: 'wheel file Publish'
    retryCountOnTaskFailure: 3
    inputs:
      targetPath: $(Build.ArtifactStagingDirectory)
  - task: Bash@3
    displayName: Save pipeline artifact file names
    inputs:
      workingDirectory: $(Pipeline.Workspace)
      targetType: inline
      script: |
        whlFile=$(find "$(Build.ArtifactStagingDirectory)" -type f -name "*.whl" | head -n 1)
        if [ -n "$whlFile" ]; then
          echo $(basename "$whlFile") >> pipelineArtifacts.txt
        fi
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
  #   parameters:
  #     aptPackages: ${{ parameters.aptPackages }}
  #     pipModules: ${{ parameters.pipModules }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
  - job: Tensile_test_${{ job.os }}_${{ job.target }}
  - job: Tensile_test_${{ job.target }}
    timeoutInMinutes: 180
    dependsOn: Tensile_build_${{ job.os }}
    dependsOn: Tensile_build
    condition:
      and(succeeded(),
      eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -166,23 +126,20 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - task: DownloadPipelineArtifact@2
      displayName: 'Download Pipeline Wheel Files'
      inputs:
        itemPattern: '**/*${{ job.os }}*.whl'
        itemPattern: '**/*.whl'
        targetPath: $(Agent.BuildDirectory)
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - task: Bash@3
      displayName: pip install
@@ -207,7 +164,7 @@ jobs:
      inputs:
        targetType: inline
        script: tox run -v -e ci -- -m pre_checkin
        workingDirectory: $(Agent.BuildDirectory)/s
        workingDirectory: $(Build.SourcesDirectory)
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
@@ -16,66 +16,50 @@ parameters:
  - cmake
  - libdrm-dev
  - ninja-build
  - python3-pip
  - pkg-config

- name: jobMatrix
  type: object
  default:
    buildJobs:
    - { os: ubuntu2204, packageManager: apt }
    - { os: ubuntu2404, packageManager: apt }
    - { os: almalinux8, packageManager: dnf }
    testJobs:
    - { os: ubuntu2204, packageManager: apt, target: gfx942 }
    - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    - gfx942:
        target: gfx942
    - gfx90a:
        target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: amdsmi_build_${{ job.os }}
    pool:
      ${{ if eq(job.os, 'ubuntu2404') }}:
        vmImage: 'ubuntu-24.04'
      ${{ else }}:
        vmImage: 'ubuntu-22.04'
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
          -DBUILD_TESTS=ON
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
- job: amdsmi_build
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
  pool:
    vmImage: ${{ variables.BASE_BUILD_POOL }}
  workspace:
    clean: all
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      extraBuildFlags: >-
        -DBUILD_TESTS=ON
        -GNinja
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
  #   parameters:
  #     aptPackages: ${{ parameters.aptPackages }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
  - job: amdsmi_test_${{ job.os }}_${{ job.target }}
    dependsOn: amdsmi_build_${{ job.os }}
- job: amdsmi_test_${{ job.target }}
  dependsOn: amdsmi_build
    condition:
      and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -92,11 +76,8 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
      parameters:
        runRocminfo: false
@@ -104,9 +85,8 @@ jobs:
      parameters:
        componentName: amdsmi
        testDir: '$(Agent.BuildDirectory)'
        testExecutable: 'sudo ./rocm/share/amd_smi/tests/amdsmitst'
        testExecutable: './rocm/share/amd_smi/tests/amdsmitst'
        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}

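The amdsmi file above illustrates the jobMatrix pattern these commits roll out across components: one parameters object carries the build and test matrices, and a compile-time ${{ each }} loop stamps out one concrete job per entry, replacing the older single-job layout shown beneath it. A minimal, self-contained sketch of the same pattern; the component name and echo step are placeholders:

    parameters:
    - name: jobMatrix
      type: object
      default:
        buildJobs:
        - { os: ubuntu2204, packageManager: apt }
        - { os: almalinux8, packageManager: dnf }

    jobs:
    # expands at template-compile time into example_build_ubuntu2204
    # and example_build_almalinux8
    - ${{ each job in parameters.jobMatrix.buildJobs }}:
      - job: example_build_${{ job.os }}
        steps:
        - script: echo "building on ${{ job.os }} via ${{ job.packageManager }}"
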
@@ -1,7 +1,4 @@
parameters:
- name: componentName
  type: string
  default: aomp
- name: checkoutRepo
  type: string
  default: 'self'
@@ -18,187 +15,173 @@ parameters:
- name: aptPackages
  type: object
  default:
  - bison
  - ccache
  - cmake
  - flex
  - gawk
  - git
  - mesa-common-dev
  - python3-pip
  - ninja-build
  - libbabeltrace-dev
  - libbison-dev
  - pkg-config
  - libpci-dev
  - libnuma-dev
  - libffi-dev
  - git
  - libopenmpi-dev
  - gawk
  - mesa-common-dev
  - libtool
  - libdrm-amdgpu1
  - libdrm-dev
  - libdw-dev
  - libffi-dev
  - libgmp-dev
  - liblzma-dev
  - libmpfr-dev
  - libncurses5-dev
  - libnuma-dev
  - libopenmpi-dev
  - libpci-dev
  - libgtest-dev
  - libsystemd-dev
  - libssl-dev
  - libstdc++-12-dev
  - libsystemd-dev
  - libtool
  - ccache
  - libgmp-dev
  - libmpfr-dev
  - texinfo
  - libbison-dev
  - bison
  - flex
  - libbabeltrace-dev
  - libncurses5-dev
  - liblzma-dev
  - python3-setuptools
  - python3-dev
  - libudev-dev
  - parallel
  - pkg-config
  - python3-dev
  - python3-pip
  - python3-setuptools
  - texinfo
# Referencing comment snippet.
#
# snippet from https://github.com/ROCm/aomp/blob/aomp-dev/bin/build_aomp.sh#L131-L134
#
# For ROCM build (AOMP_STANDALONE_BUILD=0) the components roct, rocr,
# libdevice, project, comgr, rocminfo, hipamd, rocdbgapi, rocgdb,
# roctracer, rocprofiler, rocm_smi_lib, and amdsmi should be found
# in ROCM in /opt/rocm. The ROCM build only needs these components:
- name: rocmDependencies
  type: object
  default:
  - llvm-project
  - ROCR-Runtime
- name: rocmTestDependencies
  type: object
  default:
  - amdsmi
  - clr
  - llvm-project
  - ROCdbgapi
  - ROCgdb
  - rocm-cmake
  - rocm-core
  - rocminfo
  - ROCR-Runtime
  - rocm_smi_lib
  - rocprofiler
  - rocprofiler-register
  - rocprofiler-sdk
  - ROCR-Runtime
  - roctracer

- name: jobMatrix
  type: object
  default:
    buildJobs:
    - { os: ubuntu2204, packageManager: apt }
    - { os: almalinux8, packageManager: dnf }
    testJobs:
    - { os: ubuntu2204, packageManager: apt, target: gfx942 }
    - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    - gfx942:
        target: gfx942
    - gfx90a:
        target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    pool:
      ${{ if eq(job.os, 'ubuntu2404') }}:
        name: rocm-ci_medium_build_pool_2404
      ${{ else }}:
        name: ${{ variables.MEDIUM_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    # checkout the repos tied to openmp-extras, plus llvm-project
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: aomp-extras_repo
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: flang_repo
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: llvm-project_repo
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
        dependencyList:
        - gtest
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        useAmdclang: false
        componentName: extras
        cmakeBuildDir: '$(Build.SourcesDirectory)/aomp-extras/build'
        cmakeSourceDir: '$(Build.SourcesDirectory)/aomp-extras'
        installDir: '$(Build.BinariesDirectory)/llvm'
        extraBuildFlags: >-
          -DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm
          -DCMAKE_BUILD_TYPE=Release
          -DAOMP_STANDALONE_BUILD=0
          -DAOMP_VERSION_STRING=9.99.99
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        componentName: openmp
        cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/openmp/build'
        cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/openmp'
        installDir: '$(Build.BinariesDirectory)/llvm'
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
          -DCMAKE_BUILD_TYPE=Release
          -DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
          -DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
          -DOPENMP_ENABLE_LIBOMPTARGET=1
          -DLIBOMP_COPY_EXPORTS=OFF
          -DLIBOMP_OMPD_SUPPORT=ON
          -DCMAKE_SKIP_INSTALL_RPATH=TRUE
          -DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
          -DLIBOMP_FORTRAN_MODULES_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/flang
          -DLIBOMP_MODULES_INSTALL_PATH=$(Build.BinariesDirectory)/llvm/include/flang/
        multithreadFlag: -- -j32
    - task: Bash@3
      displayName: 'ROCm symbolic link'
      inputs:
        targetType: inline
        script: sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        componentName: offload
        cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/offload/build'
        cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/offload'
        installDir: '$(Build.BinariesDirectory)/llvm'
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
          -DCMAKE_BUILD_TYPE=Release
          -DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
          -DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
          -DCMAKE_SKIP_INSTALL_RPATH=TRUE
          -DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
          -DLIBOMPTARGET_LLVM_INCLUDE_DIRS=$(Build.SourcesDirectory)/llvm-project/llvm/include
          -DCMAKE_EXE_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
          -DCMAKE_SHARED_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - ${{ if eq(job.os, 'ubuntu2204') }}:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
- job: aomp
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
  pool: ${{ variables.MEDIUM_BUILD_POOL }}
  workspace:
    clean: all
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  # checkout the repos tied to openmp-extras, plus llvm-project
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: aomp-extras_repo
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: flang_repo
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: llvm-project_repo
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
    parameters:
      checkoutRef: ${{ parameters.checkoutRef }}
      dependencyList: ${{ parameters.rocmDependencies }}
      aggregatePipeline: ${{ parameters.aggregatePipeline }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: extras
      cmakeBuildDir: '$(Build.SourcesDirectory)/aomp-extras/build'
      cmakeSourceDir: '$(Build.SourcesDirectory)/aomp-extras'
      installDir: '$(Build.BinariesDirectory)/llvm'
      extraBuildFlags: >-
        -DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm
        -DCMAKE_BUILD_TYPE=Release
        -DAOMP_STANDALONE_BUILD=0
        -DAOMP_VERSION_STRING=9.99.99
        -GNinja
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: openmp
      cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/openmp/build'
      cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/openmp'
      installDir: '$(Build.BinariesDirectory)/llvm'
      extraBuildFlags: >-
        -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
        -DCMAKE_BUILD_TYPE=Release
        -DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
        -DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
        -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
        -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
        -DOPENMP_ENABLE_LIBOMPTARGET=1
        -DLIBOMP_COPY_EXPORTS=OFF
        -DLIBOMP_OMPT_SUPPORT=ON
        -DLIBOMP_OMPD_SUPPORT=ON
        -DCMAKE_SKIP_INSTALL_RPATH=TRUE
        -DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
        -DLIBOMP_FORTRAN_MODULES_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/flang
        -DLIBOMP_MODULES_INSTALL_PATH=$(Build.BinariesDirectory)/llvm/include/flang/
        -GNinja
  - task: Bash@3
    displayName: 'ROCm symbolic link'
    inputs:
      targetType: inline
      script: sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      componentName: offload
      cmakeBuildDir: '$(Build.SourcesDirectory)/llvm-project/offload/build'
      cmakeSourceDir: '$(Build.SourcesDirectory)/llvm-project/offload'
      installDir: '$(Build.BinariesDirectory)/llvm'
      extraBuildFlags: >-
        -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Build.BinariesDirectory)"
        -DCMAKE_BUILD_TYPE=Release
        -DOPENMP_TEST_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
        -DOPENMP_TEST_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
        -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
        -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
        -DCMAKE_SKIP_INSTALL_RPATH=TRUE
        -DLLVM_MAIN_INCLUDE_DIR=$(Build.SourcesDirectory)/llvm-project/llvm/include
        -DLIBOMPTARGET_LLVM_INCLUDE_DIRS=$(Build.SourcesDirectory)/llvm-project/llvm/include
        -DCMAKE_EXE_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
        -DCMAKE_SHARED_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/llvm/lib"
        -GNinja
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
  - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
    dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
- job: aomp_test_${{ job.target }}
  dependsOn: aomp
    condition:
      and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -215,16 +198,12 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmTestDependencies }}
        os: ${{ job.os }}
        dependencyList: ${{ parameters.rocmDependencies }}
    - task: Bash@3
      displayName: ROCm symbolic link
      inputs:
@@ -236,7 +215,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: aomp-extras_repo
    # these copy steps are from the aomp prototype script for test prep
    - task: CopyFiles@2
      displayName: 'Copy AOMP contents'
      inputs:

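The test jobs above, like those in every component in this diff, are gated on per-GPU-target pipeline variables. upper() is a compile-time template function, so for a matrix entry with target gfx942 the expression is resolved to ENABLE_GFX942_TESTS before the run starts, and the runtime condition then reads that variable from the common variable group. The gate in isolation, as a sketch:

    # For job.target == gfx942 this compiles down to:
    #   and(succeeded(), eq(variables['ENABLE_GFX942_TESTS'], 'true'))
    condition: >-
      and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'))
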
@@ -1,42 +1,36 @@
parameters:
- name: jobMatrix
  type: object
  default:
    copyJobs:
    - { os: ubuntu2204, backend: amd }
    - { os: almalinux8, backend: amd }
    - { os: ubuntu2204, backend: nvidia }
    - { os: almalinux8, backend: nvidia }
- name: checkoutRepo
  type: string
  default: 'self'
- name: checkoutRef
  type: string
  default: ''

# hip and clr are tightly-coupled
# run this same template for both repos
# any changes for clr should just trigger HIP pipeline
jobs:
- ${{ each job in parameters.jobMatrix.copyJobs }}:
  - job: hip_clr_combined_${{ job.os }}_${{ job.backend }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    pool:
      vmImage: ${{ variables.BASE_BUILD_POOL }}
    workspace:
      clean: all
    steps:
    # checkout nothing, just copy artifacts from triggering HIP job
    # and then publish for this clr job or for this hipother job to maintain latest
    - checkout: none
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-download.yml
      parameters:
        componentName: HIP
        pipelineId: $(HIP_PIPELINE_ID)
        fileFilter: ${{ job.os }}*${{ job.backend }}
    - task: Bash@3
      displayName: Copy HIP artifacts
      inputs:
        targetType: inline
        script: cp -a $(Agent.BuildDirectory)/rocm/* $(Build.BinariesDirectory)/
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- job: hip_clr_combined
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
  pool:
    vmImage: ${{ variables.BASE_BUILD_POOL }}
  workspace:
    clean: all
  steps:
  # checkout nothing, just copy artifacts from triggering HIP job
  # and then publish for this clr job or for this hipother job to maintain latest
  - checkout: none
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-download.yml
    parameters:
      componentName: HIP
      pipelineId: $(HIP_PIPELINE_ID)
  - task: Bash@3
    displayName: Copy HIP artifacts
    inputs:
      targetType: inline
      script: cp -a $(Agent.BuildDirectory)/rocm/* $(Build.BinariesDirectory)/
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml

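The hip/clr template above is a pure repackaging job: it checks out no sources, pulls the artifacts produced by the triggering HIP pipeline, and republishes them under this pipeline's identity so downstream consumers always see a matching latest. Reduced to its skeleton, with HIP_PIPELINE_ID coming from the variable group as in the file above:

    steps:
    - checkout: none                      # artifacts only; no sources needed
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-download.yml
      parameters:
        componentName: HIP
        pipelineId: $(HIP_PIPELINE_ID)    # the pipeline whose run triggered this one
    - task: Bash@3
      displayName: Copy HIP artifacts
      inputs:
        targetType: inline
        # stage the downloaded tree where the upload template expects it
        script: cp -a $(Agent.BuildDirectory)/rocm/* $(Build.BinariesDirectory)/
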
@@ -1,29 +1,10 @@
parameters:
- name: componentName
  type: string
  default: hipBLAS-common
- name: checkoutRepo
  type: string
  default: 'self'
- name: checkoutRef
  type: string
  default: ''
# monorepo related parameters
- name: sparseCheckoutDir
  type: string
  default: ''
- name: triggerDownstreamJobs
  type: boolean
  default: false
- name: downstreamAggregateNames
  type: string
  default: ''
- name: buildDependsOn
  type: object
  default: null
- name: unifiedBuild
  type: boolean
  default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -33,103 +14,54 @@ parameters:
  type: object
  default:
  - cmake
  - git
  - ninja-build
  - git
  - wget
  - python3-pip
- name: rocmDependencies
  type: object
  default:
  - clr
  - llvm-project
  - rocm-cmake
  - rocminfo
  - llvm-project
  - ROCR-Runtime

- name: jobMatrix
  type: object
  default:
    buildJobs:
    - { os: ubuntu2204, packageManager: apt }
    - { os: almalinux8, packageManager: dnf }
# - name: downstreamComponentMatrix
#   type: object
#   default:
#   - hipBLASLt:
#       name: hipBLASLt
#       sparseCheckoutDir: projects/hipblaslt
#       skipUnifiedBuild: 'false'
#       buildDependsOn:
#       - hipBLAS_common_build
  - clr
  - rocminfo

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: hipBLAS_common_build_${{ job.os }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
      - ${{ each build in parameters.buildDependsOn }}:
        - ${{ build }}_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    - name: ROCM_PATH
      value: $(Agent.BuildDirectory)/rocm
    pool:
      vmImage: ${{ variables.BASE_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        os: ${{ job.os }}
        componentName: ${{ parameters.componentName }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}
    #     extraEnvVars:
    #     - ROCM_PATH:::/home/user/workspace/rocm

# - ${{ if parameters.triggerDownstreamJobs }}:
#   - ${{ each component in parameters.downstreamComponentMatrix }}:
#     - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
#       - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
#         parameters:
#           checkoutRepo: ${{ parameters.checkoutRepo }}
#           sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
#           buildDependsOn: ${{ component.buildDependsOn }}
#           downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
#           triggerDownstreamJobs: true
#           unifiedBuild: ${{ parameters.unifiedBuild }}
- job: hipBLAS_common
  variables:
  - group: common
  - name: ROCM_PATH
    value: $(Agent.BuildDirectory)/rocm
  - template: /.azuredevops/variables-global.yml
  pool:
    vmImage: ${{ variables.BASE_BUILD_POOL }}
  workspace:
    clean: all
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
    parameters:
      checkoutRef: ${{ parameters.checkoutRef }}
      dependencyList: ${{ parameters.rocmDependencies }}
      aggregatePipeline: ${{ parameters.aggregatePipeline }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      extraBuildFlags: >-
        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
        -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
        -GNinja
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
  # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
  #   parameters:
  #     aptPackages: ${{ parameters.aptPackages }}
  #     extraEnvVars:
  #     - ROCM_PATH:::/home/user/workspace/rocm

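The hipBLAS-common jobs above show the monorepo chaining idiom used throughout these templates: buildDependsOn defaults to null, the ${{ if }} guard means the dependsOn key is omitted entirely unless a caller passes a list, and each entry it does pass is suffixed with the job's OS so it points at the matching upstream job. The idiom in isolation, with an illustrative upstream name:

    - job: hipBLAS_common_build_${{ job.os }}
      # rendered only when the caller passes buildDependsOn;
      # otherwise this job has no dependencies at all
      ${{ if parameters.buildDependsOn }}:
        dependsOn:
        - ${{ each build in parameters.buildDependsOn }}:
          - ${{ build }}_${{ job.os }}   # e.g. someUpstream_build_ubuntu2204
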
@@ -1,29 +1,10 @@
parameters:
- name: componentName
  type: string
  default: hipBLASLt
- name: checkoutRepo
  type: string
  default: 'self'
- name: checkoutRef
  type: string
  default: ''
# monorepo related parameters
- name: sparseCheckoutDir
  type: string
  default: ''
- name: triggerDownstreamJobs
  type: boolean
  default: false
- name: downstreamAggregateNames
  type: string
  default: ''
- name: buildDependsOn
  type: object
  default: null
- name: unifiedBuild
  type: boolean
  default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -32,8 +13,6 @@ parameters:
- name: aptPackages
  type: object
  default:
  - ccache
  - gfortran
  - git
  - libdrm-dev
  - libmsgpack-dev
@@ -41,6 +20,9 @@ parameters:
  - ninja-build
  - python3-pip
  - python3-venv
  - gfortran
  - libblas-dev
  - ccache
- name: pipModules
  type: object
  default:
@@ -55,7 +37,6 @@ parameters:
  - hipBLAS-common
  - llvm-project
  - rocminfo
  - rocm-cmake
  - rocm_smi_lib
  - rocprofiler-register
  - ROCR-Runtime
@@ -77,37 +58,20 @@ parameters:
  type: object
  default:
    buildJobs:
    - { os: ubuntu2204, packageManager: apt, target: gfx942 }
    - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
    # - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
    - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
    # - { os: almalinux8, packageManager: dnf, target: gfx942 }
    # - { os: almalinux8, packageManager: dnf, target: gfx90a }
    # - { os: almalinux8, packageManager: dnf, target: gfx1201 }
    # - { os: almalinux8, packageManager: dnf, target: gfx1100 }
    # - { os: almalinux8, packageManager: dnf, target: gfx1030 }
    - gfx942:
        target: gfx942
    - gfx90a:
        target: gfx90a
    testJobs:
    - { os: ubuntu2204, packageManager: apt, target: gfx942 }
    - { os: ubuntu2204, packageManager: apt, target: gfx90a }
# - name: downstreamComponentMatrix
#   type: object
#   default:
#   - rocBLAS:
#       name: rocBLAS
#       sparseCheckoutDir: projects/rocblas
#       skipUnifiedBuild: 'false'
#       buildDependsOn:
#       - hipBLASLt_build
    - gfx942:
        target: gfx942
    - gfx90a:
        target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
  - job: hipBLASLt_build_${{ job.target }}
    timeoutInMinutes: 300
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
      - ${{ each build in parameters.buildDependsOn }}:
        - ${{ build }}_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -122,10 +86,6 @@ jobs:
    - name: DAY_STRING
      value: $[format('{0:ddMMyyyy}', pipeline.startTime)]
    pool: ${{ variables.ULTRA_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
@@ -133,22 +93,17 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - task: Bash@3
      displayName: Add ROCm binaries to PATH
      inputs:
@@ -156,20 +111,22 @@ jobs:
        script: |
          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
    # hipBLASLt has a script for gtest and lapack
    # https://github.com/ROCm/hipBLASLt/blob/develop/deps/CMakeLists.txt
    # $(Agent.BuildDirectory)/deps is a temporary folder for the build process
    # $(Agent.BuildDirectory)/s/deps is part of the hipBLASLt repo
    - task: Bash@3
      displayName: Build and install external dependencies
      inputs:
        targetType: inline
        script: |
          mkdir -p $(Agent.BuildDirectory)/deps
          cd $(Agent.BuildDirectory)/deps
          cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
          make
          sudo make install
    # Build and install gtest, lapack, hipBLAS-common
    # $(Pipeline.Workspace)/deps is a temporary folder for the build process
    # $(Pipeline.Workspace)/s/deps is part of the hipBLASLt repo
    - script: mkdir $(Pipeline.Workspace)/deps
      displayName: Create temp folder for external dependencies
    # hipBLASLt already has a CMake script for external deps, so we can just run that
    # https://github.com/ROCm/hipBLASLt/blob/develop/deps/CMakeLists.txt
    - script: cmake $(Pipeline.Workspace)/s/deps
      displayName: Configure hipBLASLt external dependencies
      workingDirectory: $(Pipeline.Workspace)/deps
    - script: make
      displayName: Build hipBLASLt external dependencies
      workingDirectory: $(Pipeline.Workspace)/deps
    - script: sudo make install
      displayName: Install hipBLASLt external dependencies
      workingDirectory: $(Pipeline.Workspace)/deps
    - script: |
        mkdir -p $(CCACHE_DIR)
        echo "##vso[task.prependpath]/usr/lib/ccache"
@@ -177,117 +134,93 @@ jobs:
    - task: Cache@2
      displayName: Ccache caching
      inputs:
        key: hipBLASLt | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING) | $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
        key: hipBLASLt | $(Agent.OS) | ${{ job.target }} | $(DAY_STRING) | $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
        path: $(CCACHE_DIR)
        restoreKeys: |
          hipBLASLt | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING)
          hipBLASLt | ${{ job.os }} | ${{ job.target }}
          hipBLASLt | ${{ job.os }}
          hipBLASLt | $(Agent.OS) | ${{ job.target }} | $(DAY_STRING)
          hipBLASLt | $(Agent.OS) | ${{ job.target }}
          hipBLASLt | $(Agent.OS)
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        extraBuildFlags: >-
          -DCMAKE_BUILD_TYPE=Release
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
          -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
          -DCMAKE_C_COMPILER_LAUNCHER=ccache
          -DAMDGPU_TARGETS=${{ job.target }}
          -DTensile_LOGIC=
          -DTensile_CPU_THREADS=
          -DTensile_LIBRARY_FORMAT=msgpack
          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
          -DBUILD_CLIENTS_TESTS=ON
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - ${{ if eq(job.os, 'ubuntu2204') }}:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          pipModules: ${{ parameters.pipModules }}
          gpuTarget: ${{ job.target }}
          extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
          installLatestCMake: true
          extraEnvVars:
          - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
          - TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/amdclang
          - TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
          - ROCM_PATH:::/home/user/workspace/rocm
          extraCopyDirectories:
          - deps
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        gpuTarget: ${{ job.target }}
        extraPaths: /home/user/workspace/rocm/llvm/bin:/home/user/workspace/rocm/bin
        installLatestCMake: true
        extraEnvVars:
        - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
        - TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/amdclang
        - TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
        - ROCM_PATH:::/home/user/workspace/rocm
        extraCopyDirectories:
        - deps

- ${{ if eq(parameters.unifiedBuild, False) }}:
  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
      timeoutInMinutes: 300
      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
      condition:
        and(succeeded(),
            eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
            not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
            eq(${{ parameters.aggregatePipeline }}, False)
        )
      variables:
      - group: common
      - template: /.azuredevops/variables-global.yml
      - name: ROCM_PATH
        value: $(Agent.BuildDirectory)/rocm
      pool: ${{ job.target }}_test_pool
      workspace:
        clean: all
      steps:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          pipModules: ${{ parameters.pipModules }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
        parameters:
          preTargetFilter: ${{ parameters.componentName }}
          os: ${{ job.os }}
          gpuTarget: ${{ job.target }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
        parameters:
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
        parameters:
          checkoutRef: ${{ parameters.checkoutRef }}
          dependencyList: ${{ parameters.rocmTestDependencies }}
          os: ${{ job.os }}
          gpuTarget: ${{ job.target }}
          ${{ if parameters.triggerDownstreamJobs }}:
            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
          os: ${{ job.os }}
          testDir: '$(Agent.BuildDirectory)/rocm/bin'
          testExecutable: './hipblaslt-test'
          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          pipModules: ${{ parameters.pipModules }}
          environment: test
          gpuTarget: ${{ job.target }}

# - ${{ if parameters.triggerDownstreamJobs }}:
#   - ${{ each component in parameters.downstreamComponentMatrix }}:
#     - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
#       - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
#         parameters:
#           checkoutRepo: ${{ parameters.checkoutRepo }}
#           sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
#           buildDependsOn: ${{ component.buildDependsOn }}
#           downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
#           triggerDownstreamJobs: true
#           unifiedBuild: ${{ parameters.unifiedBuild }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
  - job: hipBLASLt_test_${{ job.target }}
    timeoutInMinutes: 300
    dependsOn: hipBLASLt_build_${{ job.target }}
    condition:
      and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
          eq(${{ parameters.aggregatePipeline }}, False)
      )
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    - name: ROCM_PATH
      value: $(Agent.BuildDirectory)/rocm
    pool: ${{ job.target }}_test_pool
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmTestDependencies }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
        componentName: hipBLASLt
        testDir: '$(Agent.BuildDirectory)/rocm/bin'
        testExecutable: './hipblaslt-test'
        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        environment: test
        gpuTarget: ${{ job.target }}

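The hipBLASLt build above day-stamps its ccache key: the first build of a day uploads a fresh cache, later builds that day get exact hits, and on an exact-key miss restoreKeys fall back to the newest cache sharing the longest matching prefix. The essential shape, assuming CCACHE_DIR is supplied by the common variable group as in the file:

    variables:
    - name: DAY_STRING
      value: $[format('{0:ddMMyyyy}', pipeline.startTime)]

    steps:
    - task: Cache@2
      displayName: Ccache caching
      inputs:
        # exact key: component | OS | GPU target | day
        key: hipBLASLt | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING)
        path: $(CCACHE_DIR)
        # on a miss, restore the newest cache matching the longest prefix below
        restoreKeys: |
          hipBLASLt | ${{ job.os }} | ${{ job.target }}
          hipBLASLt | ${{ job.os }}
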
@@ -1,29 +1,10 @@
parameters:
- name: componentName
  type: string
  default: hipCUB
- name: checkoutRepo
  type: string
  default: 'self'
- name: checkoutRef
  type: string
  default: ''
# monorepo related parameters
- name: sparseCheckoutDir
  type: string
  default: ''
- name: triggerDownstreamJobs
  type: boolean
  default: false
- name: downstreamAggregateNames
  type: string
  default: ''
- name: buildDependsOn
  type: object
  default: null
- name: unifiedBuild
  type: boolean
  default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -33,8 +14,9 @@ parameters:
  type: object
  default:
  - cmake
  - git
  - ninja-build
  - libgtest-dev
  - git
  - python3-pip
- name: rocmDependencies
  type: object
@@ -51,143 +33,103 @@ parameters:
  - llvm-project
  - rocminfo
  - rocPRIM
  - rocprofiler-register
  - ROCR-Runtime
  - rocprofiler-register

- name: jobMatrix
  type: object
  default:
    buildJobs:
    - { os: ubuntu2204, packageManager: apt, target: gfx942 }
    - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
    - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
    - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
    - { os: almalinux8, packageManager: dnf, target: gfx942 }
    - { os: almalinux8, packageManager: dnf, target: gfx90a }
    - { os: almalinux8, packageManager: dnf, target: gfx1201 }
    - { os: almalinux8, packageManager: dnf, target: gfx1100 }
    - { os: almalinux8, packageManager: dnf, target: gfx1030 }
    - gfx942:
        target: gfx942
    - gfx90a:
        target: gfx90a
    testJobs:
    - { os: ubuntu2204, packageManager: apt, target: gfx942 }
    - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    - gfx942:
        target: gfx942
    - gfx90a:
        target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
      - ${{ each build in parameters.buildDependsOn }}:
        - ${{ build }}_${{ job.os }}_${{ job.target }}
  - job: hipCUB_build_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    pool: ${{ variables.MEDIUM_BUILD_POOL }}
    ${{ if eq(job.os, 'almalinux8') }}:
      container:
        image: rocmexternalcicd.azurecr.io/manylinux228:latest
        endpoint: ContainerService3
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
        dependencyList:
        - gtest
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        consolidateBuildAndInstall: true
        extraBuildFlags: >-
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
          -DBUILD_BENCHMARK=ON
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DBUILD_TEST=ON
          -DAMDGPU_TARGETS=${{ job.target }}
          -GNinja
        extraCxxFlags: -Wno-deprecated-declarations
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - ${{ if eq(job.os, 'ubuntu2204') }}:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        gpuTarget: ${{ job.target }}

- ${{ if eq(parameters.unifiedBuild, False) }}:
  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
      condition:
        and(succeeded(),
            eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
            not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
            eq(${{ parameters.aggregatePipeline }}, False)
        )
      variables:
      - group: common
      - template: /.azuredevops/variables-global.yml
      pool: ${{ job.target }}_test_pool
      workspace:
        clean: all
      steps:
      - checkout: none
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          packageManager: ${{ job.packageManager }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
        parameters:
          preTargetFilter: ${{ parameters.componentName }}
          gpuTarget: ${{ job.target }}
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
        parameters:
          checkoutRef: ${{ parameters.checkoutRef }}
          dependencyList: ${{ parameters.rocmTestDependencies }}
          gpuTarget: ${{ job.target }}
          os: ${{ job.os }}
          ${{ if parameters.triggerDownstreamJobs }}:
            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: ${{ parameters.componentName }}
          testDir: '$(Agent.BuildDirectory)/rocm/bin/hipcub'
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
          environment: test
          gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
  - job: hipCUB_test_${{ job.target }}
    dependsOn: hipCUB_build_${{ job.target }}
    condition:
      and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
          eq(${{ parameters.aggregatePipeline }}, False)
      )
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
    pool: ${{ job.target }}_test_pool
    workspace:
      clean: all
    steps:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmTestDependencies }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
        componentName: hipCUB
        testDir: '$(Agent.BuildDirectory)/rocm/bin/hipcub'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        environment: test
        gpuTarget: ${{ job.target }}

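Beyond the per-target enable flag, the hipCUB test jobs above consult a comma-separated blocklist: split() turns DISABLED_<TARGET>_TESTS into an array and containsValue() checks membership. The new template matches on the literal component name rather than Build.DefinitionName, which keeps the gate stable if a pipeline is renamed. The expression in isolation:

    # skipped when 'hipCUB' appears in e.g. DISABLED_GFX942_TESTS = "hipCUB,rocFFT"
    condition: >-
      and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')))
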
@@ -1,29 +1,10 @@
parameters:
- name: componentName
  type: string
  default: hipFFT
- name: checkoutRepo
  type: string
  default: 'self'
- name: checkoutRef
  type: string
  default: ''
# monorepo related parameters
- name: sparseCheckoutDir
  type: string
  default: ''
- name: triggerDownstreamJobs
  type: boolean
  default: false
- name: downstreamAggregateNames
  type: string
  default: ''
- name: buildDependsOn
  type: object
  default: null
- name: unifiedBuild
  type: boolean
  default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -80,11 +61,7 @@ parameters:

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.target }}
    ${{ if parameters.buildDependsOn }}:
      dependsOn:
      - ${{ each build in parameters.buildDependsOn }}:
        - ${{ build }}_${{ job.target }} # todo: add OS
  - job: hipFFT_build_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -102,15 +79,12 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        extraBuildFlags: >-
@@ -128,11 +102,9 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -141,8 +113,8 @@ jobs:
    #     gpuTarget: ${{ job.target }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
  - job: ${{ parameters.componentName }}_test_${{ job.target }}
    dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
  - job: hipFFT_test_${{ job.target }}
    dependsOn: hipFFT_build_${{ job.target }}
    condition:
      and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -162,7 +134,6 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
      parameters:
        preTargetFilter: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
@@ -170,12 +141,10 @@ jobs:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmTestDependencies }}
        gpuTarget: ${{ job.target }}
        ${{ if parameters.triggerDownstreamJobs }}:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
      parameters:
        componentName: ${{ parameters.componentName }}
        componentName: hipFFT
        testDir: '$(Agent.BuildDirectory)/rocm/bin'
        testExecutable: './hipfft-test'
        testParameters: '--test_prob 0.002 --gtest_output=xml:./test_output.xml --gtest_color=yes'

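The hipFFT test job hands steps/test.yml a directory, an executable, and a flag string; the template's internals are not part of this diff, but the parameters suggest it amounts to roughly the following, sketched here as an inline task. --test_prob 0.002 presumably asks the test binary to sample a small fraction of its problem space, and the gtest flags produce a JUnit-style XML report plus colored output:

    - task: Bash@3
      displayName: Run hipFFT smoke tests
      inputs:
        targetType: inline
        # approximate expansion of steps/test.yml for the parameters above;
        # the real template likely adds result publishing and error handling
        script: ./hipfft-test --test_prob 0.002 --gtest_output=xml:./test_output.xml --gtest_color=yes
        workingDirectory: $(Agent.BuildDirectory)/rocm/bin
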
@@ -1,29 +1,10 @@
parameters:
- name: componentName
type: string
default: hipRAND
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -33,18 +14,18 @@ parameters:
type: object
default:
- cmake
- git
- ninja-build
- googletest
- git
- python3-pip
- name: rocmDependencies
type: object
default:
- clr
- llvm-project
- rocm-cmake
- ROCR-Runtime
- clr
- rocminfo
- rocRAND
- ROCR-Runtime
- name: rocmTestDependencies
type: object
default:
@@ -52,168 +33,110 @@ parameters:
- llvm-project
- rocminfo
- rocprofiler-register
- rocRAND
- ROCR-Runtime
- rocRAND

- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
# - name: downstreamComponentMatrix
# type: object
# default:
# - rocFFT:
# name: rocFFT
# sparseCheckoutDir: projects/rocfft
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - hipRAND_build
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
- job: hipRAND_build_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DBUILD_TEST=ON
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-DAMDGPU_TARGETS=${{ job.target }}
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# gpuTarget: ${{ job.target }}
# extraEnvVars:
# - HIP_ROCCLR_HOME:::/home/user/workspace/rocm

- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/hipRAND'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}

# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: hipRAND_test_${{ job.target }}
dependsOn: hipRAND_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: hipRAND
testDir: '$(Agent.BuildDirectory)/rocm/bin/hipRAND'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}

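For orientation, a minimal sketch of how a monorepo pipeline might consume the parameterized hipRAND template above, following the commented-out downstream-trigger pattern in this diff; the projects/hiprand sparse-checkout path and the libraries_repo checkout alias are assumptions, not confirmed by this change:

jobs:
- template: /.azuredevops/components/hipRAND.yml@pipelines_repo
  parameters:
    checkoutRepo: libraries_repo        # assumed monorepo checkout alias
    sparseCheckoutDir: projects/hiprand # assumed monorepo path
    triggerDownstreamJobs: false
    unifiedBuild: false
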
@@ -14,188 +14,146 @@ parameters:
type: object
default:
- cmake
- python3-pip
- libnuma-dev
- ninja-build
- pkg-config
- python-is-python3
- python3-pip
- zlib1g-dev
- pkg-config
- name: rocmDependencies
type: object
default:
- rocm-cmake

- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: ubuntu2404, packageManager: apt }
- { os: almalinux8, packageManager: dnf }

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: llvm_project_${{ job.os }}
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
name: 'rocm-ci_high_build_pool_2404' #temporarily using 'high' pool while 'ultra' is down
${{ else }}:
name: 'rocm-ci_ultra_build_pool'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: HIP_DEVICE_LIB_PATH
value: '$(Build.BinariesDirectory)/amdgcn/bitcode'
- name: HIP_PATH
value: '$(Agent.BuildDirectory)/rocm'
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
skipLlvmSymlink: true
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: rocm-llvm
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Build.BinariesDirectory)/llvm;$(Build.BinariesDirectory)"
-DCMAKE_BUILD_TYPE=Release
-DLLVM_ENABLE_PROJECTS=clang;lld;clang-tools-extra;mlir;flang
-DLLVM_ENABLE_RUNTIMES=compiler-rt;libunwind;libcxx;libcxxabi
-DCLANG_ENABLE_AMDCLANG=ON
-DLLVM_TARGETS_TO_BUILD=AMDGPU;X86
-DLIBCXX_ENABLE_SHARED=OFF
-DLIBCXX_ENABLE_STATIC=ON
-DLIBCXX_INSTALL_LIBRARY=OFF
-DLIBCXX_INSTALL_HEADERS=OFF
-DLIBCXXABI_ENABLE_SHARED=OFF
-DLIBCXXABI_ENABLE_STATIC=ON
-DLIBCXXABI_INSTALL_STATIC_LIBRARY=OFF
-DLLVM_BUILD_DOCS=OFF
-DLLVM_ENABLE_SPHINX=OFF
-DLLVM_ENABLE_ASSERTIONS=OFF
-DLLVM_ENABLE_Z3_SOLVER=OFF
-DLLVM_ENABLE_ZLIB=ON
-DCLANG_DEFAULT_LINKER=lld
-DCLANG_DEFAULT_RTLIB=compiler-rt
-DCLANG_DEFAULT_UNWINDLIB=libgcc
-DSANITIZER_AMDGPU=OFF
-DPACKAGE_VENDOR=AMD
-DCLANG_LINK_FLANG_LEGACY=ON
-DCMAKE_CXX_STANDARD=17
-DROCM_LLVM_BACKWARD_COMPAT_LINK=$(Build.BinariesDirectory)/llvm
-DROCM_LLVM_BACKWARD_COMPAT_LINK_TARGET=./lib/llvm
-GNinja
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm'
installDir: '$(Build.BinariesDirectory)/llvm'
# use llvm-lit to run unit tests for llvm, clang, and lld
- task: Bash@3
displayName: 'Copy llvm-lit to install directory'
inputs:
targetType: inline
script: |
cp $(Build.SourcesDirectory)/llvm/build/bin/llvm-lit $(Build.BinariesDirectory)/llvm/bin/
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: check-llvm
testDir: 'llvm/build'
testExecutable: './bin/llvm-lit'
testParameters: '-q --xunit-xml-output=llvm_test_output.xml --filter-out="live-debug-values-spill-tracking" ./test'
testOutputFile: llvm_test_output.xml
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: check-clang
testDir: 'llvm/build'
testExecutable: './bin/llvm-lit'
testParameters: '-q --xunit-xml-output=clang_test_output.xml ./tools/clang/test'
testOutputFile: clang_test_output.xml
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: check-lld
testDir: 'llvm/build'
testExecutable: './bin/llvm-lit'
testParameters: '-q --xunit-xml-output=lld_test_output.xml ./tools/lld/test'
testOutputFile: lld_test_output.xml
os: ${{ job.os }}
- task: CopyFiles@2
displayName: Copy FileCheck for Publishing
inputs:
CleanTargetFolder: false
SourceFolder: llvm/build/bin
Contents: FileCheck
TargetFolder: $(Build.BinariesDirectory)/llvm/bin
retryCount: 3
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: device-libs
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build"
-DCMAKE_BUILD_TYPE=Release
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/device-libs/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/device-libs'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: comgr
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build;$(Build.SourcesDirectory)/amd/device-libs/build"
-DCOMGR_DISABLE_SPIRV=1
-DCMAKE_BUILD_TYPE=Release
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/comgr/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/comgr'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: comgr
testParameters: '--output-on-failure --force-new-ctest-process --output-junit comgr_test_output.xml'
testDir: 'amd/comgr/build'
testOutputFile: comgr_test_output.xml
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: hipcc
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DHIPCC_BACKWARD_COMPATIBILITY=OFF
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/hipcc/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/hipcc'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: combined
extraEnvVars:
- HIP_DEVICE_LIB_PATH:::/home/user/workspace/bin/amdgcn/bitcode
- HIP_PATH:::/home/user/workspace/rocm
- job: llvm_project
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: HIP_DEVICE_LIB_PATH
value: '$(Build.BinariesDirectory)/amdgcn/bitcode'
- name: HIP_PATH
value: '$(Agent.BuildDirectory)/rocm'
pool: ${{ variables.ULTRA_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
skipLlvmSymlink: true
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: rocm-llvm
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Build.BinariesDirectory)/llvm;$(Build.BinariesDirectory)"
-DCMAKE_BUILD_TYPE=Release
-DLLVM_ENABLE_PROJECTS=clang;lld;clang-tools-extra;mlir;flang
-DLLVM_ENABLE_RUNTIMES=compiler-rt;libunwind;libcxx;libcxxabi
-DCLANG_ENABLE_AMDCLANG=ON
-DLLVM_TARGETS_TO_BUILD=AMDGPU;X86
-DLIBCXX_ENABLE_SHARED=OFF
-DLIBCXX_ENABLE_STATIC=ON
-DLIBCXX_INSTALL_LIBRARY=OFF
-DLIBCXX_INSTALL_HEADERS=OFF
-DLIBCXXABI_ENABLE_SHARED=OFF
-DLIBCXXABI_ENABLE_STATIC=ON
-DLIBCXXABI_INSTALL_STATIC_LIBRARY=OFF
-DLLVM_BUILD_DOCS=OFF
-DLLVM_ENABLE_SPHINX=OFF
-DLLVM_ENABLE_ASSERTIONS=OFF
-DLLVM_ENABLE_Z3_SOLVER=OFF
-DLLVM_ENABLE_ZLIB=ON
-DCLANG_DEFAULT_LINKER=lld
-DCLANG_DEFAULT_RTLIB=compiler-rt
-DCLANG_DEFAULT_UNWINDLIB=libgcc
-DSANITIZER_AMDGPU=OFF
-DPACKAGE_VENDOR=AMD
-DCLANG_LINK_FLANG_LEGACY=ON
-DCMAKE_CXX_STANDARD=17
-DROCM_LLVM_BACKWARD_COMPAT_LINK=$(Build.BinariesDirectory)/llvm
-DROCM_LLVM_BACKWARD_COMPAT_LINK_TARGET=./lib/llvm
-GNinja
cmakeBuildDir: '$(Build.SourcesDirectory)/llvm/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/llvm'
installDir: '$(Build.BinariesDirectory)/llvm'
# use llvm-lit to run unit tests for llvm, clang, and lld
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: check-llvm
testDir: 'llvm/build'
testExecutable: './bin/llvm-lit'
testParameters: '-q --xunit-xml-output=llvm_test_output.xml --filter-out="live-debug-values-spill-tracking" ./test'
testOutputFile: llvm_test_output.xml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: check-clang
testDir: 'llvm/build'
testExecutable: './bin/llvm-lit'
testParameters: '-q --xunit-xml-output=clang_test_output.xml ./tools/clang/test'
testOutputFile: clang_test_output.xml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: check-lld
testDir: 'llvm/build'
testExecutable: './bin/llvm-lit'
testParameters: '-q --xunit-xml-output=lld_test_output.xml ./tools/lld/test'
testOutputFile: lld_test_output.xml
- task: CopyFiles@2
displayName: Copy FileCheck for Publishing
inputs:
CleanTargetFolder: false
SourceFolder: llvm/build/bin
Contents: FileCheck
TargetFolder: $(Build.BinariesDirectory)/llvm/bin
retryCount: 3
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: device-libs
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build"
-DCMAKE_BUILD_TYPE=Release
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/device-libs/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/device-libs'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: comgr
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Build.SourcesDirectory)/llvm/build;$(Build.SourcesDirectory)/amd/device-libs/build"
-DCOMGR_DISABLE_SPIRV=1
-DCMAKE_BUILD_TYPE=Release
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/comgr/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/comgr'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: comgr
testParameters: '--output-on-failure --force-new-ctest-process --output-junit comgr_test_output.xml'
testDir: 'amd/comgr/build'
testOutputFile: comgr_test_output.xml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: hipcc
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DHIPCC_BACKWARD_COMPATIBILITY=OFF
cmakeBuildDir: '$(Build.SourcesDirectory)/amd/hipcc/build'
cmakeSourceDir: '$(Build.SourcesDirectory)/amd/hipcc'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: combined
extraEnvVars:
- HIP_DEVICE_LIB_PATH:::/home/user/workspace/bin/amdgcn/bitcode
- HIP_PATH:::/home/user/workspace/rocm

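A hedged sketch of narrowing the new per-OS jobMatrix when invoking the llvm-project component template, e.g. for a quick single-OS run; the component file name is inferred from the job names in this diff and is an assumption:

jobs:
- template: /.azuredevops/components/llvm-project.yml@pipelines_repo
  parameters:
    jobMatrix:
      buildJobs:
      - { os: ubuntu2204, packageManager: apt }
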
@@ -15,6 +15,7 @@ parameters:
default:
- cmake
- git
- googletest
- libboost-program-options-dev
- libdrm-dev
- libfftw3-dev
@@ -89,10 +90,6 @@ jobs:
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
submoduleBehaviour: recursive
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
@@ -104,11 +101,12 @@ jobs:
extraBuildFlags: >-
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/hipcc
-DHALF_INCLUDE_DIR=$(Agent.BuildDirectory)/rocm/include
-DCMAKE_BUILD_TYPE=Release
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TESTS=ON
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/rocm/share/rocm/cmake;$(Agent.BuildDirectory)/rocm/libexec/hipify
-DGPU_TARGETS=${{ job.target }}
-DAMDGPU_TARGETS=${{ job.target }}
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:

@@ -1,29 +1,10 @@
parameters:
- name: componentName
type: string
default: rocBLAS
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -83,43 +64,19 @@ parameters:
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
# - { os: almalinux8, packageManager: dnf, target: gfx942 }
# - { os: almalinux8, packageManager: dnf, target: gfx90a }
# - { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
# - { os: almalinux8, packageManager: dnf, target: gfx1030 }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
# - name: downstreamComponentMatrix
# type: object
# default:
# # rocSOLVER depends on both rocBLAS and rocPRIM
# # for a unified build, rocBLAS will be the one to call rocSOLVER
# - rocSOLVER:
# name: rocSOLVER
# sparseCheckoutDir: projects/rocsolver
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - rocBLAS_build
# unifiedBuild:
# downstreamAggregateNames: rocBLAS+rocPRIM
# buildDependsOn:
# - rocBLAS_build
# - rocPRIM_build
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
- job: rocBLAS_build_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -132,10 +89,6 @@ jobs:
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
@@ -143,26 +96,19 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aocl.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_TOOLCHAIN_FILE=toolchain-linux.cmake
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
@@ -182,94 +128,63 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
installAOCL: true
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
- ROCM_PATH:::/home/user/workspace/rocm
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
installAOCL: true
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- TENSILE_ROCM_ASSEMBLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
- ROCM_PATH:::/home/user/workspace/rocm

- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocblas-test'
testParameters: '--yaml rocblas_smoke.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}

# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}
# ${{ if parameters.unifiedBuild }}:
# buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
# ${{ else }}:
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocBLAS_test_${{ job.target }}
dependsOn: rocBLAS_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocBLAS
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocblas-test'
testParameters: '--yaml rocblas_smoke.yaml --gtest_output=xml:./test_output.xml --gtest_color=yes'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}

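The test jobs above are gated per GPU target through pipeline variables. A minimal sketch of how those switches might be declared, assuming they live in the common variable group the jobs import; the names follow the ENABLE_<TARGET>_TESTS / DISABLED_<TARGET>_TESTS pattern used in the conditions, and the values are illustrative only:

variables:
  ENABLE_GFX942_TESTS: 'true'
  ENABLE_GFX90A_TESTS: 'true'
  # comma-separated component names to skip on a target (illustrative value)
  DISABLED_GFX942_TESTS: ''
  DISABLED_GFX90A_TESTS: 'rocBLAS'
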
@@ -1,7 +1,4 @@
parameters:
- name: componentName
type: string
default: rocDecode
- name: checkoutRepo
type: string
default: 'self'
@@ -16,28 +13,29 @@ parameters:
- name: aptPackages
type: object
default:
- python3-pip
- cmake
- ninja-build
- pkg-config
- ffmpeg
- libavcodec-dev
- libavformat-dev
- libavutil-dev
- libdrm-dev
- libstdc++-12-dev
- libva-amdgpu-dev
- mesa-amdgpu-va-drivers
- ninja-build
- pkg-config
- libdrm-dev
- python3-pip
- name: rocmDependencies
type: object
default:
- clr
- llvm-project
- rocm-cmake
- rocm-core
- rocminfo
- rocprofiler-register
- llvm-project
- ROCR-Runtime
- clr
- rocminfo
- rocm-core
- rocprofiler-register
- name: rocmTestDependencies
type: object
default:
@@ -50,70 +48,53 @@ parameters:
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_BUILD_TYPE=Release
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# registerROCmPackages: true
- job: rocDecode_build
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# registerROCmPackages: true

- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
- job: rocDecode_test_${{ job.target }}
dependsOn: rocDecode_build
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -133,27 +114,20 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- task: Bash@3
displayName: Build rocDecode tests
inputs:
targetType: inline
script: |
${{ iif(eq(job.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
mkdir rocDecode-tests
cd rocDecode-tests
cmake $(Agent.BuildDirectory)/rocm/share/rocdecode/test
@@ -162,7 +136,6 @@ jobs:
parameters:
componentName: rocDecode
testDir: 'rocDecode-tests'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

|
||||
parameters:
|
||||
- name: componentName
|
||||
type: string
|
||||
default: rocFFT
|
||||
- name: checkoutRepo
|
||||
type: string
|
||||
default: 'self'
|
||||
- name: checkoutRef
|
||||
type: string
|
||||
default: ''
|
||||
# monorepo related parameters
|
||||
- name: sparseCheckoutDir
|
||||
type: string
|
||||
default: ''
|
||||
- name: triggerDownstreamJobs
|
||||
type: boolean
|
||||
default: false
|
||||
- name: downstreamAggregateNames
|
||||
type: string
|
||||
default: ''
|
||||
- name: buildDependsOn
|
||||
type: object
|
||||
default: null
|
||||
- name: unifiedBuild
|
||||
type: boolean
|
||||
default: false
|
||||
# set to true if doing full build of ROCm stack
|
||||
# and dependencies are pulled from same pipeline
|
||||
- name: aggregatePipeline
|
||||
@@ -78,23 +59,10 @@ parameters:
|
||||
target: gfx942
|
||||
- gfx90a:
|
||||
target: gfx90a
|
||||
# - name: downstreamComponentMatrix
|
||||
# type: object
|
||||
# default:
|
||||
# - hipFFT:
|
||||
# name: hipFFT
|
||||
# sparseCheckoutDir: projects/hipfft
|
||||
# skipUnifiedBuild: 'false'
|
||||
# buildDependsOn:
|
||||
# - rocFFT_build
|
||||
|
||||
jobs:
|
||||
- ${{ each job in parameters.jobMatrix.buildJobs }}:
|
||||
- job: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
${{ if parameters.buildDependsOn }}:
|
||||
dependsOn:
|
||||
- ${{ each build in parameters.buildDependsOn }}:
|
||||
- ${{ build }}_ubuntu2204_${{ job.target }} # todo: un-hardcode OS
|
||||
- job: rocFFT_build_${{ job.target }}
|
||||
variables:
|
||||
- group: common
|
||||
- template: /.azuredevops/variables-global.yml
|
||||
@@ -111,15 +79,12 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
|
||||
parameters:
|
||||
checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
parameters:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
aggregatePipeline: ${{ parameters.aggregatePipeline }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
|
||||
parameters:
|
||||
extraBuildFlags: >-
|
||||
@@ -136,11 +101,9 @@ jobs:
|
||||
-GNinja
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
|
||||
@@ -151,8 +114,8 @@ jobs:
|
||||
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
|
||||
|
||||
- ${{ each job in parameters.jobMatrix.testJobs }}:
|
||||
- job: ${{ parameters.componentName }}_test_${{ job.target }}
|
||||
dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
|
||||
- job: rocFFT_test_${{ job.target }}
|
||||
dependsOn: rocFFT_build_${{ job.target }}
|
||||
condition:
|
||||
and(succeeded(),
|
||||
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
|
||||
@@ -172,7 +135,6 @@ jobs:
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
|
||||
parameters:
|
||||
preTargetFilter: ${{ parameters.componentName }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
|
||||
@@ -180,12 +142,10 @@ jobs:
|
||||
checkoutRef: ${{ parameters.checkoutRef }}
|
||||
dependencyList: ${{ parameters.rocmTestDependencies }}
|
||||
gpuTarget: ${{ job.target }}
|
||||
${{ if parameters.triggerDownstreamJobs }}:
|
||||
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
|
||||
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
|
||||
parameters:
|
||||
componentName: ${{ parameters.componentName }}
|
||||
componentName: rocFFT
|
||||
testDir: '$(Agent.BuildDirectory)/rocm/bin'
|
||||
testExecutable: './rocfft-test'
|
||||
testParameters: '--test_prob 0.004 --gtest_output=xml:./test_output.xml --gtest_color=yes'
|
||||
@@ -194,15 +154,3 @@ jobs:
|
||||
aptPackages: ${{ parameters.aptPackages }}
|
||||
environment: test
|
||||
gpuTarget: ${{ job.target }}
|
||||
|
||||
# - ${{ if parameters.triggerDownstreamJobs }}:
|
||||
# - ${{ each component in parameters.downstreamComponentMatrix }}:
|
||||
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
|
||||
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
|
||||
# parameters:
|
||||
# checkoutRepo: ${{ parameters.checkoutRepo }}
|
||||
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
|
||||
# buildDependsOn: ${{ component.buildDependsOn }}
|
||||
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
|
||||
# triggerDownstreamJobs: true
|
||||
# unifiedBuild: ${{ parameters.unifiedBuild }}
|
||||
|
||||
@@ -1,7 +1,4 @@
parameters:
- name: componentName
type: string
default: rocJPEG
- name: checkoutRepo
type: string
default: 'self'
@@ -47,44 +44,32 @@ parameters:
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
- job: rocJPEG_build_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
name: rocm-ci_medium_build_pool_2404
${{ else }}:
name: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -95,26 +80,17 @@ jobs:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
-DCMAKE_BUILD_TYPE=Release
-DGPU_TARGETS=${{ job.target }}
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
@@ -123,8 +99,8 @@ jobs:
# registerROCmPackages: true

- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
- job: rocJPEG_test_${{ job.target }}
dependsOn: rocJPEG_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -144,28 +120,22 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- task: Bash@3
displayName: Build rocJPEG tests
inputs:
targetType: inline
script: |
${{ iif(eq(job.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
mkdir rocJPEG-tests
cd rocJPEG-tests
cmake $(Agent.BuildDirectory)/rocm/share/rocjpeg/test
@@ -174,7 +144,6 @@ jobs:
parameters:
componentName: rocJPEG
testDir: 'rocJPEG-tests'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

@@ -1,29 +1,16 @@
parameters:
- name: componentName
type: string
default: rocPRIM
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckout
type: boolean
default: false
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -33,17 +20,18 @@ parameters:
type: object
default:
- cmake
- git
- ninja-build
- libgtest-dev
- git
- python3-pip
- name: rocmDependencies
type: object
default:
- clr
- llvm-project
- rocm-cmake
- rocminfo
- llvm-project
- ROCR-Runtime
- clr
- rocminfo
- name: rocmTestDependencies
type: object
default:
@@ -57,175 +45,98 @@ parameters:
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 1, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 2, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx942, shard: 3, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a, shard: 1, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a, shard: 2, shardCount: 3 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a, shard: 3, shardCount: 3 }
- name: downstreamComponentMatrix
type: object
default:
- rocThrust:
name: rocThrust
sparseCheckoutDir: projects/rocthrust
skipUnifiedBuild: 'false'
buildDependsOn:
- rocPRIM_build
- hipCUB:
name: hipCUB
sparseCheckoutDir: projects/hipcub
skipUnifiedBuild: 'false'
buildDependsOn:
- rocPRIM_build
# rocSOLVER depends on both rocBLAS and rocPRIM
# for a unified build, rocBLAS will be the one to call rocSOLVER
# - rocSOLVER:
# name: rocSOLVER
# sparseCheckoutDir: projects/rocsolver
# skipUnifiedBuild: 'true'
# buildDependsOn:
# - rocPRIM_build
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
- job: rocPRIM_build_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckout: ${{ parameters.sparseCheckout }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_BENCHMARK=ON
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DAMDGPU_TARGETS=${{ job.target }}
-DBUILD_BENCHMARK=ON
-DBUILD_TEST=ON
-GNinja
extraCxxFlags: -Wno-deprecated-declarations
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}

- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}_shard_${{ job.shard }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocprim'
extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }}'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}

- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocPRIM_test_${{ job.target }}
dependsOn: rocPRIM_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocPRIM
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocprim'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}

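# A note on the rocPRIM test sharding above (a sketch, assuming stock CTest semantics):
# extraTestParameters '-I <shard>,,<shardCount>' maps onto ctest's -I Start,End,Stride
# option, where an empty End defaults to the last test. With shardCount 3, the three
# shard jobs run disjoint, interleaved slices of the same suite:
#   ctest -I 1,,3   # tests 1, 4, 7, ...
#   ctest -I 2,,3   # tests 2, 5, 8, ...
#   ctest -I 3,,3   # tests 3, 6, 9, ...
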
@@ -1,29 +1,10 @@
parameters:
- name: componentName
type: string
default: rocRAND
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -34,16 +15,18 @@ parameters:
default:
- cmake
- git
- googletest
- libgtest-dev
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
default:
- clr
- llvm-project
- rocm-cmake
- rocminfo
- llvm-project
- ROCR-Runtime
- clr
- rocminfo
- name: rocmTestDependencies
type: object
default:
@@ -57,96 +40,56 @@ parameters:
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- name: downstreamComponentMatrix
type: object
default:
- hipRAND:
name: hipRAND
sparseCheckoutDir: projects/hiprand
skipUnifiedBuild: 'false'
buildDependsOn:
- rocRAND_build
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
- job: rocRAND_build_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
vmImage: 'ubuntu-24.04'
${{ else }}:
name: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DBUILD_TEST=ON
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DAMDGPU_TARGETS=${{ job.target }}
-DBUILD_TEST=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
@@ -155,63 +98,42 @@ jobs:
# extraEnvVars:
# - HIP_ROCCLR_HOME:::/home/user/workspace/rocm

- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocRAND'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}

- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocRAND_test_${{ job.target }}
dependsOn: rocRAND_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocRAND
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocRAND'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}

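# Sketch of the downstream gating used by the trigger blocks above (the truth table is
# illustrative; the last row matches the commented-out rocSOLVER entry in rocPRIM's matrix):
#   unifiedBuild   component.skipUnifiedBuild   downstream template instantiated?
#   false          'false'                      yes
#   false          'true'                       yes
#   true           'false'                      yes
#   true           'true'                       no  (rocBLAS triggers rocSOLVER instead)
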
@@ -1,29 +1,10 @@
parameters:
- name: componentName
type: string
default: rocSOLVER
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -45,12 +26,14 @@ parameters:
type: object
default:
- clr
- hipSPARSE
- llvm-project
- rocBLAS
- rocm-cmake
- rocminfo
- rocPRIM
- ROCR-Runtime
- rocSPARSE
- name: rocmTestDependencies
type: object
default:
@@ -72,47 +55,33 @@ parameters:
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
# - { os: almalinux8, packageManager: dnf, target: gfx942 }
# - { os: almalinux8, packageManager: dnf, target: gfx90a }
# - { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
# - { os: almalinux8, packageManager: dnf, target: gfx1030 }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
- job: rocSOLVER_build_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- task: Bash@3
displayName: 'Clone lapack'
inputs:
@@ -123,15 +92,11 @@ jobs:
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: lapack
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_Fortran_FLAGS=-fno-optimize-sibling-calls
@@ -144,7 +109,6 @@ jobs:
installDir: '$(Pipeline.Workspace)/deps-install'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
@@ -156,71 +120,56 @@ jobs:
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraCopyDirectories:
- deps-install
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
extraCopyDirectories:
- deps-install

- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
os: ${{ job.os }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocsolver-test'
testParameters: '--gtest_filter="*checkin*" --gtest_output=xml:./test_output.xml --gtest_color=yes'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocSOLVER_test_${{ job.target }}
dependsOn: rocSOLVER_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocSOLVER
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './rocsolver-test'
testParameters: '--gtest_filter="*checkin*" --gtest_output=xml:./test_output.xml --gtest_color=yes'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}

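# How the two build-cmake passes in the rocSOLVER job above fit together (a sketch; it
# assumes the template's installDir parameter becomes the install prefix of the first pass):
#   pass 1: build the reference lapack into $(Pipeline.Workspace)/deps-install
#   pass 2: build rocSOLVER with CMAKE_PREFIX_PATH listing the ROCm tree first, then the
#           lapack scratch prefix:
#     "-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install"
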
@@ -1,29 +1,10 @@
parameters:
- name: componentName
type: string
default: rocThrust
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -33,17 +14,18 @@ parameters:
type: object
default:
- cmake
- git
- ninja-build
- libboost-program-options-dev
- googletest
- libfftw3-dev
- git
- python3-pip
- name: rocmDependencies
type: object
default:
- clr
- hipRAND
- llvm-project
- rocm-cmake
- rocminfo
- rocPRIM
- ROCR-Runtime
@@ -54,142 +36,104 @@ parameters:
- llvm-project
- rocminfo
- rocPRIM
- rocprofiler-register
- ROCR-Runtime
- hipRAND
- rocprofiler-register

- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_${{ job.os }}_${{ job.target }}
- job: rocThrust_build_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
consolidateBuildAndInstall: true
extraBuildFlags: >-
-GNinja
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DAMDGPU_TARGETS=${{ job.target }}
-DBUILD_TEST=ON
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
componentName: ${{ parameters.componentName }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
gpuTarget: ${{ job.target }}

- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
preTargetFilter: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocthrust'
testParameters: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml --exclude-regex "scan.hip"'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocThrust_test_${{ job.target }}
dependsOn: rocThrust_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocThrust
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocthrust'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
environment: test
gpuTarget: ${{ job.target }}

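# Sketch of how downstreamAggregateNames accumulates along a trigger chain (values are
# hypothetical): each component appends '+<componentName>' before instantiating the next
# template, so a chain rooted at rocPRIM that triggers rocThrust would pass:
#   rocPRIM   -> downstreamAggregateNames: ''          # root of the chain
#   rocThrust -> downstreamAggregateNames: '+rocPRIM'  # '' + '+' + 'rocPRIM'
# which dependencies-rocm.yml can then use to locate artifacts built earlier in the chain.
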
@@ -16,6 +16,8 @@ parameters:
- doxygen
- doxygen-doc
- ninja-build
- python3-pip
- python3-sphinx
- name: pipModules
type: object
default:
@@ -23,75 +25,49 @@ parameters:
- cmake==3.20.5
- ninja
- rocm-docs-core
- sphinx

- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: ubuntu2404, packageManager: apt }
- { os: almalinux8, packageManager: dnf }

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocm_cmake_${{ job.os }}
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
vmImage: 'ubuntu-24.04'
${{ else }}:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- task: Bash@3
displayName: Add CMake to PATH
inputs:
targetType: inline
script: echo "##vso[task.prependpath]$(python3 -m site --user-base)/bin"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
- task: Bash@3
displayName: CTest setup
inputs:
targetType: inline
script: |
python -m pip install -r $(Build.SourcesDirectory)/docs/requirements.txt
python -m pip install -r $(Build.SourcesDirectory)/test/docsphinx/docs/.sphinx/requirements.txt
git config --global user.email "you@example.com"
git config --global user.name "Your Name"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocm-cmake
testParameters: '-E "pass-version-parent" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
# environment: combined
- job: rocm_cmake
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- task: Bash@3
displayName: Add CMake to PATH
inputs:
targetType: inline
script: echo "##vso[task.prependpath]$(python3 -m site --user-base)/bin"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
- task: Bash@3
displayName: CTest setup
inputs:
targetType: inline
script: |
python -m pip install -r $(Build.SourcesDirectory)/docs/requirements.txt
python -m pip install -r $(Build.SourcesDirectory)/test/docsphinx/docs/.sphinx/requirements.txt
git config --global user.email "you@example.com"
git config --global user.name "Your Name"
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocm-cmake
testParameters: '-E "pass-version-parent" --output-on-failure --force-new-ctest-process --output-junit test_output.xml'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# pipModules: ${{ parameters.pipModules }}
# environment: combined

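# Note on the "Add CMake to PATH" steps above: ##vso[task.prependpath] is an Azure DevOps
# logging command that prepends a directory to PATH for every subsequent step in the job.
# Pointing it at the pip --user install location means the pinned cmake==3.20.5 wheel
# shadows the agent's system cmake, e.g. (illustrative):
#   script: echo "##vso[task.prependpath]$(python3 -m site --user-base)/bin"
#   # later steps now resolve `cmake` from that user-base bin directory
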
@@ -15,61 +15,39 @@ parameters:
default:
- cmake
- ninja-build

- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: ubuntu2404, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
- python3-pip

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocm_core_${{ job.os }}
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
vmImage: 'ubuntu-24.04'
${{ else }}:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_CURRENT_BINARY_DIR=$PWD
-DCMAKE_CURRENT_SOURCE_DIR=$PWD/../
-DCMAKE_VERBOSE_MAKEFILE=1
-DCPACK_GENERATOR=DEB
-DCPACK_DEBIAN_PACKAGE_RELEASE="local.9999~99.99"
-DCPACK_RPM_PACKAGE_RELEASE="local.9999"
-DROCM_VERSION="$(NEXT_RELEASE_VERSION)"
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
- job: rocm_core
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_CURRENT_BINARY_DIR=$PWD
-DCMAKE_CURRENT_SOURCE_DIR=$PWD/../
-DCMAKE_VERBOSE_MAKEFILE=1
-DCPACK_GENERATOR=DEB
-DCPACK_DEBIAN_PACKAGE_RELEASE="local.9999~99.99"
-DCPACK_RPM_PACKAGE_RELEASE="local.9999"
-DROCM_VERSION="$(NEXT_RELEASE_VERSION)"
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}

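# A short note on the CPack flags above (interpretation; the file names below are
# hypothetical): rocm-core is packaged directly by CPack, with the DEB generator selected
# and the release fields stamped so local CI artifacts are distinguishable from shipped
# packages, roughly:
#   rocm-core_<version>-local.9999~99.99_amd64.deb   # from CPACK_DEBIAN_PACKAGE_RELEASE
#   rocm-core-<version>-local.9999.x86_64.rpm        # from CPACK_RPM_PACKAGE_RELEASE
# with <version> supplied through ROCM_VERSION from the NEXT_RELEASE_VERSION variable.
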
@@ -15,7 +15,6 @@ parameters:
default:
- cmake
- libglfw3-dev
- libmsgpack-dev
- libtbb-dev
- ninja-build
- python3-pip

@@ -17,66 +17,50 @@ parameters:
- libdrm-dev
- ninja-build
- pkg-config
- python3-pip

- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: ubuntu2404, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocm_smi_lib_build_${{ job.os }}
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
vmImage: 'ubuntu-24.04'
${{ else }}:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DBUILD_TESTS=ON
-DROCM_DEP_ROCMCORE=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
- job: rocm_smi_lib_build
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DBUILD_TESTS=ON
-DROCM_DEP_ROCMCORE=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
dependsOn: rocm_smi_lib_build_${{ job.os }}
- job: rocm_smi_lib_test_${{ job.target }}
dependsOn: rocm_smi_lib_build
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -93,11 +77,8 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
parameters:
runRocminfo: false
@@ -105,9 +86,8 @@ jobs:
parameters:
componentName: rocm_smi_lib
testDir: '$(Agent.BuildDirectory)'
testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
testExecutable: './rocm/share/rocm_smi/rsmitst_tests/rsmitst'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

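# Note on the rsmitst invocation above: the two testExecutable lines are the before and
# after sides of this diff (with sudo and without). In both cases the gtest flags are
# passed through unchanged, so a local reproduction would look roughly like:
#   ./rocm/share/rocm_smi/rsmitst_tests/rsmitst --gtest_output=xml:./test_output.xml --gtest_color=yes
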
@@ -17,6 +17,7 @@ parameters:
- libdrm-amdgpu-dev
- libdrm-dev
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
default:
@@ -31,63 +32,49 @@ parameters:
- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocminfo_build_${{ job.os }}
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
skipLlvmSymlink: true
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCRTST_BLD_TYPE=release
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- job: rocminfo
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
skipLlvmSymlink: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DROCRTST_BLD_TYPE=release
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml

- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: rocminfo_test_${{ job.target }}
dependsOn: rocminfo_build_${{ job.os }}
dependsOn: rocminfo
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -104,18 +91,14 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
parameters:
runRocminfo: false
@@ -126,7 +109,6 @@ jobs:
testExecutable: './rocm/bin/rocminfo'
testParameters: ''
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocm_agent_enumerator
@@ -134,7 +116,6 @@ jobs:
testExecutable: './rocm/bin/rocm_agent_enumerator'
testParameters: ''
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

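The build jobs above always run on an Ubuntu host VM and only attach a container for AlmaLinux. A minimal sketch of that pattern in isolation, assuming the same manylinux image used throughout these templates:

- job: example_build_almalinux8
  pool:
    vmImage: 'ubuntu-22.04'   # host VM stays Ubuntu
  container:                  # job steps run inside the manylinux container
    image: rocmexternalcicd.azurecr.io/manylinux228:latest
    endpoint: ContainerService3
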
@@ -24,28 +24,24 @@ parameters:
default:
- astunparse==1.6.2
- colorlover
- dash-bootstrap-components
- dash-svg
- "dash>=3.0.0"
- kaleido==0.2.1
- "dash>=1.12.0"
- matplotlib
- "numpy>=1.17.5"
- "pandas>=1.4.3"
- plotext
- plotille
- pymongo
- pyyaml
- setuptools
- tabulate
- textual
- textual_plotext
- textual-fspicker
- tqdm
- dash-svg
- dash-bootstrap-components
- kaleido
- setuptools
- plotille
- mock
- pytest
- pytest-cov
- pytest-xdist
- name: rocmTestDependencies
- name: rocmDependencies
type: object
default:
- amdsmi
@@ -118,6 +114,14 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: ${{ job.dependencySource }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
extraBuildFlags: >-
@@ -161,6 +165,14 @@ jobs:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: Bash@3
displayName: Add en_US.UTF-8 locale
inputs:
targetType: inline
script: |
sudo locale-gen en_US.UTF-8
sudo update-locale
locale -a
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
@@ -172,17 +184,9 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
dependencyList: ${{ parameters.rocmDependencies }}
dependencySource: ${{ job.dependencySource }}
gpuTarget: ${{ job.target }}
- task: Bash@3
displayName: Add en_US.UTF-8 locale
inputs:
targetType: inline
script: |
sudo locale-gen en_US.UTF-8
sudo update-locale
locale -a
- task: Bash@3
displayName: Add ROCm binaries to PATH
inputs:

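The PATH task body is cut off above. A plausible sketch of what such a step would contain (the exact script is an assumption), using the logging command Azure Pipelines provides for path updates:

- task: Bash@3
  displayName: Add ROCm binaries to PATH
  inputs:
    targetType: inline
    # prependpath takes effect for all subsequent steps in the job
    script: echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
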
@@ -15,62 +15,40 @@ parameters:
default:
- cmake
- ninja-build

- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: ubuntu2404, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
- python3-pip

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: rocprofiler_register_${{ job.os }}
pool:
${{ if eq(job.os, 'ubuntu2404') }}:
vmImage: 'ubuntu-24.04'
${{ else }}:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
variables:
- group: common
- template: /.azuredevops/variables-global.yml
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: rocprofiler-register
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Build.BinariesDirectory)
-DROCPROFILER_REGISTER_BUILD_TESTS=ON
-DROCPROFILER_REGISTER_BUILD_SAMPLES=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocprofiler-register
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# environment: combined
- job: rocprofiler_register
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
componentName: rocprofiler-register
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Build.BinariesDirectory)
-DROCPROFILER_REGISTER_BUILD_TESTS=ON
-DROCPROFILER_REGISTER_BUILD_SAMPLES=ON
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocprofiler-register
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
# parameters:
# aptPackages: ${{ parameters.aptPackages }}
# environment: combined

@@ -14,12 +14,10 @@ parameters:
type: object
default:
- build-essential
- cmake
- libdrm-amdgpu-dev
- libdrm-dev
- libdw-dev
- libelf-dev
- libsqlite3-dev
- libva-dev
- ninja-build
- pkg-config
@@ -76,7 +74,8 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ variables.MEDIUM_BUILD_POOL }}
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:

@@ -1,7 +1,4 @@
parameters:
- name: componentName
type: string
default: rocprofiler
- name: checkoutRepo
type: string
default: 'self'
@@ -18,6 +15,7 @@ parameters:
type: object
default:
- cmake
- libgtest-dev
- libdrm-dev
- libdw-dev
- libsystemd-dev
@@ -28,13 +26,13 @@ parameters:
- name: pipModules
type: object
default:
- barectf
- Cppheaderparser
- lxml
- matplotlib
- pandas
- pyyaml==5.3.1
- Cppheaderparser
- websockets
- matplotlib
- lxml
- barectf
- pandas
- name: rocmDependencies
type: object
default:
@@ -43,33 +41,29 @@ parameters:
- ROCdbgapi
- rocm-cmake
- rocm-core
- rocminfo
- rocm_smi_lib
- rocprofiler-register
- rocminfo
- ROCR-Runtime
- rocprofiler-register
- roctracer

- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
- job: rocprofiler_build_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -78,10 +72,6 @@ jobs:
- name: ROCM_PATH
value: $(Agent.BuildDirectory)/rocm
pool: ${{ variables.MEDIUM_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
@@ -89,59 +79,46 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_MODULE_PATH=$(Build.SourcesDirectory)/cmake_modules;$(Agent.BuildDirectory)/rocm/lib/cmake;$(Agent.BuildDirectory)/rocm/lib/cmake/hip;$(Agent.BuildDirectory)/rocm/lib64/cmake;$(Agent.BuildDirectory)/rocm/lib64/cmake/hip
-DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_MODULE_PATH=$(Build.SourcesDirectory)/cmake_modules;$(Agent.BuildDirectory)/rocm/lib/cmake;$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DENABLE_LDCONFIG=OFF
-DUSE_PROF_API=1
-DGPU_TARGETS=${{ job.target }}
-DAMDGPU_TARGETS=${{ job.target }}
multithreadFlag: -- -j32
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- ${{ if eq(job.os, 'ubuntu2204') }}:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ROCM_PATH:::/home/user/workspace/rocm
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
gpuTarget: ${{ job.target }}
extraEnvVars:
- HIP_ROCCLR_HOME:::/home/user/workspace/rocm
- ROCM_PATH:::/home/user/workspace/rocm

- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
- job: rocprofiler_test_${{ job.target }}
dependsOn: rocprofiler_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -162,21 +139,16 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
@@ -185,14 +157,12 @@ jobs:
testExecutable: ./run.sh
testParameters: ''
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocprofilerV2
testDir: $(Agent.BuildDirectory)/rocm
testExecutable: share/rocprofiler/tests/runUnitTests
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

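One subtlety in the build flags above: CMake list-valued variables such as CMAKE_MODULE_PATH use semicolon-separated entries, and once CMAKE_PREFIX_PATH carries two roots it is quoted so the semicolon survives argument handling. A reduced sketch of the same pattern:

extraBuildFlags: >-
  -DCMAKE_MODULE_PATH=$(Build.SourcesDirectory)/cmake_modules;$(Agent.BuildDirectory)/rocm/lib/cmake
  -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
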
@@ -1,7 +1,4 @@
parameters:
- name: componentName
type: string
default: roctracer
- name: checkoutRepo
type: string
default: 'self'
@@ -21,7 +18,7 @@ parameters:
- graphviz
- libdrm-amdgpu-dev
- ninja-build
- zlib1g-dev
- python3-pip
- name: pipModules
type: object
default:
@@ -48,32 +45,26 @@ parameters:
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- gfx942:
target: gfx942
- gfx90a:
target: gfx90a

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
- job: roctracer_build_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
@@ -81,7 +72,6 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
@@ -93,27 +83,21 @@ jobs:
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
# the linker flags will not affect ubuntu2204 builds as the paths do not exist
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
useAmdclang: false
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=release
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip;$(Agent.BuildDirectory)/rocm/lib64/cmake/hip
-DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DGPU_TARGETS=${{ job.target }}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DAMDGPU_TARGETS=${{ job.target }}
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
# - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -124,8 +108,8 @@ jobs:
# registerROCmPackages: true

- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
- job: roctracer_test_${{ job.target }}
dependsOn: roctracer_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
@@ -143,20 +127,17 @@ jobs:
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
packageManager: ${{ job.packageManager }}
registerROCmPackages: true
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
@@ -165,7 +146,6 @@ jobs:
testParameters: ''
testDir: $(Agent.BuildDirectory)
testPublishResults: false
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

@@ -11,54 +11,36 @@ parameters:
- name: aptPackages
type: object
default:
- cmake
- git
- cmake
- ninja-build

- name: jobMatrix
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: gtest_${{ job.os }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: 'ubuntu-22.04'
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
endpoint: ContainerService3
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
packageManager: ${{ job.packageManager }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: Bash@3
displayName: Clone GTest ${{ parameters.gtestVersion }}
inputs:
targetType: inline
script: git clone https://github.com/google/googletest -b ${{ parameters.gtestVersion }} --depth=1 --shallow-submodules --recurse-submodules
workingDirectory: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
os: ${{ job.os }}
cmakeBuildDir: $(Agent.BuildDirectory)/googletest/build
cmakeSourceDir: $(Agent.BuildDirectory)/googletest
useAmdclang: false
extraBuildFlags: >-
-DGTEST_FORCE_SHARED_CRT=ON
-DCMAKE_DEBUG_POSTFIX=d
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
os: ${{ job.os }}
- job: gtest
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- task: Bash@3
displayName: 'git clone gtest'
inputs:
targetType: inline
script: git clone -b ${{ parameters.gtestVersion }} https://github.com/google/googletest --depth=1 --shallow-submodules --recurse-submodules
workingDirectory: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:
cmakeBuildDir: $(Agent.BuildDirectory)/googletest/build
cmakeSourceDir: $(Agent.BuildDirectory)/googletest
extraBuildFlags: >-
-DGTEST_FORCE_SHARED_CRT=ON
-DCMAKE_DEBUG_POSTFIX=d
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml

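The gtest job only clones, builds, and uploads; consumers fetch the tarball through the shared vendor-dependency step. A hedged sketch of how a downstream pipeline might point CMake at the extracted copy, following the dependencies-vendor.yml convention used by the rocprofiler build above:

- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
  parameters:
    dependencyList:
      - gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
  parameters:
    extraBuildFlags: >-
      -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
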
@@ -4,71 +4,71 @@ parameters:
- name: aptPackages
type: object
default:
- autoconf
- build-essential
- git
- ninja-build
- openjdk-8-jdk
- ca-certificates
- bc
- bridge-utils
- build-essential
- ca-certificates
- ccache
- devscripts
- dkms
- doxygen
- fakeroot
- ffmpeg
- gfortran
- git
- gnutls-bin
- libamd2
- libavformat-dev
- libblas3
- libcamd2
- libccolamd2
- libcholmod3
- libcolamd2
- libdpkg-dev
- libdpkg-perl
- libdrm-amdgpu1
- libdrm-dev
- libelf-dev
- libfreetype-dev
- libgfortran5
- libgomp1
- libjpeg-dev
- libjpeg-turbo-official
- liblapack-dev
- liblapack3
- libmetis5
- libncurses-dev
- libnuma-dev
- libopenblas-dev
- libpth-dev
- libquadmath0
- libssh-dev
- libstdc++-12-dev
- libsuitesparseconfig5
- libswscale-dev
- libtinfo-dev
- libunwind-dev
- libwebp-dev
- llvm-dev
- ncurses-base
- ninja-build
- numactl
- openjdk-8-jdk
- python-is-python3
- python3-dev
- python3-pip
- python3-venv
- wget
- ncurses-base
- libncurses-dev
- numactl
- libnuma-dev
- libssh-dev
- libunwind-dev
- llvm-dev
- libpth-dev
- qemu-kvm
- re2c
- subversion
- wget
- fakeroot
- autoconf
- libgomp1
- libtinfo-dev
- libcholmod3
- libsuitesparseconfig5
- libstdc++-12-dev
- python-is-python3
- gfortran
- libgfortran5
- liblapack3
- libblas3
- libquadmath0
- libmetis5
- libamd2
- libcamd2
- libcolamd2
- libccolamd2
- libdrm-amdgpu1
- ccache
- zip
- libjpeg-turbo-official
- libjpeg-dev
- libwebp-dev
- libfreetype-dev
- gnutls-bin
- ffmpeg
- libopenblas-dev
- liblapack-dev
- libswscale-dev
- libavformat-dev
- name: pipModules
type: object
default:
- cmake
- astunparse
- "expecttest>=0.3.0"
- "expecttest>=0.2.1"
- hypothesis
- numpy
- psutil
@@ -76,8 +76,8 @@ parameters:
- requests
- setuptools==75.8.0
- types-dataclasses
- "typing-extensions>=4.10.0"
- "sympy>=1.13.3"
- "typing-extensions>=4.8.0"
- "sympy>=1.13.0"
- filelock
- networkx
- jinja2
@@ -97,39 +97,36 @@ parameters:
- name: rocmDependencies
type: object
default:
- rocminfo
- MIOpen
- clr
- hipBLAS
- hipBLASLt
- hipFFT
- hipRAND
- hipSOLVER
- hipSPARSE
- hipSPARSELt
- ROCR-Runtime
- llvm-project
- MIOpen
- rccl
- rocBLAS
- rocFFT
- rocm-core
- rocminfo
- rocm_smi_lib
- rocPRIM
- rocprofiler-register
- rocRAND
- ROCR-Runtime
- rocSOLVER
- rocSPARSE
- roctracer
- hipBLASLt
- rocprofiler-register
- rocm-core
- rocPRIM
# below are additional dependencies not called out by the build script, but CMake errors without them
- composable_kernel
- hipBLAS-common
- hipCUB
- rocThrust
- hipBLAS-common
- composable_kernel
- name: rocmTestDependencies
type: object
default:
# rocroller.so is needed and is not included in the wheel
- hipBLASLt
- rocminfo
# Reference on what tests to run for torchvision found in private repo:
# https://github.com/ROCm/rocAutomation/blob/jenkins-pipelines/pytorch/pytorch_ci/test_pytorch_test1.sh#L54
@@ -243,6 +240,12 @@ jobs:
git clone https://github.com/pytorch/builder.git --depth=1 --recurse-submodules
sudo ln -s $(Build.SourcesDirectory)/builder /builder
workingDirectory: $(Build.SourcesDirectory)
- task: Bash@3
displayName: Temporarily Patch CK Submodule
inputs:
targetType: inline
script: git pull origin develop
workingDirectory: $(Build.SourcesDirectory)/pytorch/third_party/composable_kernel
- task: Bash@3
displayName: Install patchelf
inputs:
@@ -264,11 +267,6 @@ jobs:
script: |
sudo bash pytorch/.ci/docker/common/install_rocm_magma.sh $(MAGMA_ROCM)
workingDirectory: $(Build.SourcesDirectory)
- task: Bash@3
displayName: Install targeted typing_extensions for build
inputs:
targetType: inline
script: pip install --target=$(Build.SourcesDirectory)/pytorch/torch/.. typing_extensions
- task: Bash@3
displayName: Run ROCm Build Script
inputs:
@@ -283,6 +281,7 @@ jobs:
PYTORCH_ROOT=$(PYTORCH_ROOT)
CMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
DESIRED_DEVTOOLSET=$(DESIRED_DEVTOOLSET)
TORCH_PACKAGE_NAME=torch.$(ROCM_BRANCH).$(JOB_GPU_TARGET)
PYTORCH_BUILD_VERSION=$(cat $(Build.SourcesDirectory)/pytorch/version.txt | cut -da -f1)
PYTORCH_BUILD_NUMBER=$(date -u +%Y%m%d)
SKIP_ALL_TESTS=1
@@ -323,6 +322,8 @@ jobs:
inputs:
targetType: inline
script: >-
TORCH_PACKAGE_NAME=torch.$(ROCM_BRANCH).$(JOB_GPU_TARGET)
TORCHVISION_PACKAGE_NAME=torchvision.$(ROCM_BRANCH).$(JOB_GPU_TARGET)
PYTORCH_VERSION=$(cat $(Build.SourcesDirectory)/pytorch/version.txt | cut -da -f1)post$(date -u +%Y%m%d)
BUILD_VERSION=$(cat $(Build.SourcesDirectory)/vision/version.txt | cut -da -f1)post$(date -u +%Y%m%d)
python3 setup.py bdist_wheel
@@ -399,9 +400,11 @@ jobs:
- task: DownloadPipelineArtifact@2
displayName: 'Download Pipeline Wheel Files'
inputs:
itemPattern: '**/*.whl'
itemPattern: '**/*$(JOB_GPU_TARGET)*.whl'
targetPath: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
parameters:
dependencySource: staging
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
dependencyList: ${{ parameters.rocmTestDependencies }}

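The two cut -da -f1 invocations above derive release-style versions from PyTorch's pre-release version files: splitting on the literal character a and keeping the first field turns a string like 2.5.0a0 into 2.5.0, and the post suffix then appends a UTC date stamp. A small standalone sketch (the sample version string is an assumption):

- task: Bash@3
  displayName: Show wheel version derivation (sketch)
  inputs:
    targetType: inline
    script: |
      echo "2.5.0a0" > version.txt
      # cut -da -f1 splits on 'a' and keeps the first field: 2.5.0
      PYTORCH_VERSION=$(cat version.txt | cut -da -f1)post$(date -u +%Y%m%d)
      echo "$PYTORCH_VERSION"   # e.g. 2.5.0post20250101
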
@@ -3,21 +3,12 @@ parameters:
- name: jobList
type: object
default:
- { os: ubuntu2204, target: gfx942, source: staging }
- { os: ubuntu2204, target: gfx90a, source: staging }
- { os: ubuntu2204, target: gfx1201, source: staging }
- { os: ubuntu2204, target: gfx1100, source: staging }
- { os: ubuntu2204, target: gfx1030, source: staging }
- { os: ubuntu2404, target: gfx942, source: staging }
- { os: ubuntu2404, target: gfx90a, source: staging }
- { os: ubuntu2404, target: gfx1201, source: staging }
- { os: ubuntu2404, target: gfx1100, source: staging }
- { os: ubuntu2404, target: gfx1030, source: staging }
- { os: almalinux8, target: gfx942, source: staging }
- { os: almalinux8, target: gfx90a, source: staging }
- { os: almalinux8, target: gfx1201, source: staging }
- { os: almalinux8, target: gfx1100, source: staging }
- { os: almalinux8, target: gfx1030, source: staging }
- gfx942-staging:
target: gfx942
source: staging
- gfx90a-staging:
target: gfx90a
source: staging
- name: rocmDependencies
type: object
default:
@@ -25,9 +16,9 @@ parameters:
- amdsmi
- aomp-extras
- aomp
- clr
- composable_kernel
- half
- HIP
- hip-tests
- hipBLAS
- hipBLAS-common
@@ -92,7 +83,7 @@ schedules:

jobs:
- ${{ each job in parameters.jobList }}:
- job: rocm_nightly_${{ job.os }}_${{ job.target }}_${{ job.source }}
- job: rocm_nightly_${{ job.target }}_${{ job.source }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -117,9 +108,9 @@ jobs:
parameters:
dependencySource: ${{ job.source }}
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
skipLibraryLinking: true
skipLlvmSymlink: true
- script: df -h
displayName: System disk space after ROCm
- script: du -sh $(Agent.BuildDirectory)/rocm

@@ -28,22 +28,12 @@ resources:
endpoint: ROCm
name: ROCm/hipother
ref: ${{ parameters.checkoutRef }}
pipelines:
- pipeline: hip_pipeline
source: \experimental\HIP
trigger: true
- pipeline: hipother_pipeline
source: \experimental\hipother
trigger: true

trigger: none
pr: none

jobs:
- ${{ if eq(variables['Build.Reason'], 'ResourceTrigger') }}:
- template: ${{ variables.CI_COMPONENT_PATH }}/copyHIP.yml@pipelines_repo
- ${{ if ne(variables['Build.Reason'], 'ResourceTrigger') }}:
- template: ${{ variables.CI_COMPONENT_PATH }}/HIP.yml@pipelines_repo
parameters:
checkoutRepo: release_repo
checkoutRef: ${{ parameters.checkoutRef }}
- template: ${{ variables.CI_COMPONENT_PATH }}/HIP.yml
parameters:
checkoutRepo: release_repo
checkoutRef: ${{ parameters.checkoutRef }}

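The two mutually exclusive ${{ if ... }} blocks above route a single pipeline definition: runs kicked off by the upstream pipeline resources (Build.Reason equals 'ResourceTrigger') take the copy path, while manual or scheduled runs do the full HIP build. The shape in miniature, with hypothetical job names for illustration:

jobs:
- ${{ if eq(variables['Build.Reason'], 'ResourceTrigger') }}:
  - job: copy_artifacts   # triggered by an upstream pipeline completing
- ${{ if ne(variables['Build.Reason'], 'ResourceTrigger') }}:
  - job: full_build       # manual, PR, or scheduled runs
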
@@ -19,24 +19,36 @@ parameters:
default: false

steps:
- task: Bash@3
displayName: Set allowPartiallySucceededBuilds
inputs:
targetType: inline
script: |
if [[ ",$ALLOWED_PARTIAL_SUCCEED_BUILDS," == *",${{ parameters.componentName }},"* ]]; then
echo "##vso[task.setvariable variable=allowPartiallySucceededBuilds;]true"
else
echo "##vso[task.setvariable variable=allowPartiallySucceededBuilds;]false"
fi
- task: DownloadPipelineArtifact@2
displayName: Download ${{ parameters.componentName }}
inputs:
itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
targetPath: '$(Pipeline.Workspace)/d'
allowPartiallySucceededBuilds: true
${{ if parameters.aggregatePipeline }}:
buildType: 'current'
${{ else }}:
${{ if eq(parameters.aggregatePipeline, false) }}:
buildType: 'specific'
project: ROCm-CI
specificBuildWithTriggering: true
definition: ${{ parameters.pipelineId }}
specificBuildWithTriggering: true
itemPattern: '**/*${{ parameters.fileFilter }}*'
# aomp is a special case, since the trigger file is under ROCm/ROCm instead of the component repo
${{ if notIn(parameters.componentName, 'aomp') }}:
buildVersionToDownload: latestFromBranch # default is 'latest'
branchName: refs/heads/${{ parameters.branchName }}
${{ if eq(parameters.componentName, 'aomp') }}:
buildVersionToDownload: latest # aomp trigger lives in ROCm/ROCm, so cannot use ROCm/aomp branch names
${{ else }}:
buildVersionToDownload: latestFromBranch
allowPartiallySucceededBuilds: $(allowPartiallySucceededBuilds)
targetPath: '$(Pipeline.Workspace)/d'
${{ else }}:
buildType: 'current'
itemPattern: '**/${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
allowPartiallySucceededBuilds: $(allowPartiallySucceededBuilds)
targetPath: '$(Pipeline.Workspace)/d'
- task: ExtractFiles@1
displayName: Extract ${{ parameters.componentName }}
inputs:

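The membership test above relies on wrapping both the list and the candidate in commas so that substring matching cannot hit partial names (rocm would otherwise match rocm_smi_lib). A standalone sketch with assumed values:

- task: Bash@3
  displayName: Comma-delimited membership check (sketch)
  inputs:
    targetType: inline
    script: |
      ALLOWED_PARTIAL_SUCCEED_BUILDS="aomp,rocprofiler"
      component="rocm"
      # ",aomp,rocprofiler," does not contain ",rocm," so this prints false
      if [[ ",$ALLOWED_PARTIAL_SUCCEED_BUILDS," == *",$component,"* ]]; then
        echo true
      else
        echo false
      fi
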
@@ -3,21 +3,15 @@
# publish can be toggled off for jobs that produce multiple tarballs
# for those cases, only publish the last call which puts all the tarballs in one container folder
parameters:
- name: componentName
type: string
default: $(Build.DefinitionName)
- name: gpuTarget
type: string
default: ''
- name: artifactName
type: string
default: drop
default: 'drop'
- name: publish
type: boolean
default: true
- name: os
- name: gpuTarget
type: string
default: 'ubuntu2204'
default: ''

steps:
- task: ArchiveFiles@2
@@ -26,7 +20,7 @@ steps:
includeRootFolder: false
archiveType: 'tar'
tarCompression: 'gz'
archiveFile: '$(Build.ArtifactStagingDirectory)/${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz'
archiveFile: '$(Build.ArtifactStagingDirectory)/$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz'
- task: DeleteFiles@1
displayName: 'Cleanup Staging Area'
inputs:
@@ -38,7 +32,7 @@ steps:
inputs:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: echo "${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt).tar.gz" >> pipelineArtifacts.txt
script: echo "$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.tar.gz" >> pipelineArtifacts.txt
# then publish it
- ${{ if parameters.publish }}:
- task: PublishPipelineArtifact@1
@@ -46,5 +40,4 @@ steps:
displayName: '${{ parameters.artifactName }} Publish'
retryCountOnTaskFailure: 3
inputs:
artifactName: ${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt)
targetPath: '$(Build.ArtifactStagingDirectory)'

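With the new parameters, one upload from this template yields a deterministic tarball name per job attempt. As an illustration only, assuming componentName rocminfo, build id 12345, build number 20250101.1, OS ubuntu2204, target gfx942, and attempt 1, the archive would come out roughly as:

rocminfo_12345_20250101.1_ubuntu2204_gfx942_drop_1.tar.gz
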
@@ -1,7 +1,4 @@
parameters:
- name: os
type: string
default: 'ubuntu2204'
- name: componentName
type: string
default: ''
@@ -23,23 +20,17 @@ steps:
displayName: '${{ parameters.componentName }} configure flags'
inputs:
targetType: inline
script: ./configure --prefix=${{ parameters.installDir }} ${{ parameters.configureFlags }}
workingDirectory: ${{ parameters.buildDir }}
script: |
${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
./configure --prefix=${{ parameters.installDir }} ${{ parameters.configureFlags }}
- task: Bash@3
displayName: '${{ parameters.componentName }} make'
inputs:
targetType: inline
script: ${{ parameters.makeCallPrefix }} make -j$(nproc)
workingDirectory: ${{ parameters.buildDir }}
script: |
${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
${{ parameters.makeCallPrefix }} make -j$(nproc)
- task: Bash@3
displayName: '${{ parameters.componentName }} make install'
inputs:
targetType: inline
script: make install
workingDirectory: ${{ parameters.buildDir }}
script: |
${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
make install

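Because each Bash@3 task is a fresh shell, the gcc-toolset-14 enable script has to be re-sourced in every step; on Ubuntu the iif collapses to an empty line. Rendered for almalinux8, the configure step would look roughly like this sketch (installDir value assumed):

- task: Bash@3
  displayName: 'example configure flags'
  inputs:
    targetType: inline
    script: |
      source /opt/rh/gcc-toolset-14/enable
      ./configure --prefix=/opt/example
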
@@ -1,16 +1,10 @@
parameters:
- name: os
type: string
default: 'ubuntu2204'
- name: componentName
type: string
default: ''
- name: extraBuildFlags
type: string
default: ''
- name: extraCxxFlags
type: string
default: ''
- name: multithreadFlag
type: string
default: ''
@@ -38,81 +32,41 @@ parameters:
- name: installEnabled
type: boolean
default: true
# for jobs that rebuild during install step and use ninja
# set to true to save time, only applies for almalinux8
- name: consolidateBuildAndInstall
type: boolean
default: false
- name: printDiskSpace
type: boolean
default: true
# todo: make this control cxx and c compiler flags
- name: useAmdclang
type: boolean
default: true

# for cmake calls, set env variables for AlmaLinux 8
# to simulate running source /opt/rh/gcc-toolset-14/enable for the session

steps:
# create workingDirectory if it does not exist and change into it
# call cmake from within that directory using $cmakeArgs as its parameters
- task: CMake@1
displayName: '${{parameters.componentName }} CMake Flags'
${{ if eq(parameters.os, 'almalinux8')}}:
env:
PATH: "/opt/rh/gcc-toolset-14/root/usr/bin:$(PATH)"
MANPATH: "/opt/rh/gcc-toolset-14/root/usr/share/man:$(MANPATH)"
INFOPATH: "/opt/rh/gcc-toolset-14/root/usr/share/info:$(INFOPATH)"
PCP_DIR: "/opt/rh/gcc-toolset-14/root"
LD_LIBRARY_PATH: "/opt/rh/gcc-toolset-14/root/usr/lib64:/opt/rh/gcc-toolset-14/root/usr/lib:$(LD_LIBRARY_PATH)"
PKG_CONFIG_PATH: "/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig:$(PKG_CONFIG_PATH)"
inputs:
workingDirectory: ${{ parameters.cmakeBuildDir }}
cmakeArgs: >-
${{ iif(parameters.customInstallPath, join('', format('-DCMAKE_INSTALL_PREFIX={0}', parameters.installDir)), '') }}
${{ iif(eq(parameters.os, 'almalinux8'), '-DCMAKE_SHARED_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/lib64 -L/opt/rh/gcc-toolset-14/root/usr/lib/gcc/x86_64-redhat-linux/14/"', '') }}
${{ iif(eq(parameters.os, 'almalinux8'), '-DCMAKE_EXE_LINKER_FLAGS="-L$(Agent.BuildDirectory)/rocm/lib64 -L/opt/rh/gcc-toolset-14/root/usr/lib/gcc/x86_64-redhat-linux/14/"', '') }}
-DCMAKE_CXX_FLAGS="${{ parameters.extraCxxFlags }} ${{ iif(and(eq(parameters.os, 'almalinux8'), parameters.useAmdclang), '--gcc-toolchain=/opt/rh/gcc-toolset-14/root', '') }}"
${{ parameters.extraBuildFlags }}
${{ parameters.cmakeSourceDir }}
${{ if eq(parameters.customInstallPath, true) }}:
cmakeArgs: -DCMAKE_INSTALL_PREFIX=${{ parameters.installDir }} ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
${{ else }}:
cmakeArgs: ${{ parameters.extraBuildFlags }} ${{ parameters.cmakeSourceDir }}
- ${{ if parameters.printDiskSpace }}:
- script: df -h
displayName: Disk space before build
# equivalent to running make $cmakeTargetDir from $cmakeBuildDir
# i.e., cd $cmakeBuildDir; make $cmakeTargetDir
- task: CMake@1
${{ if and( eq(parameters.os, 'almalinux8'), eq(parameters.consolidateBuildAndInstall , true)) }}:
displayName: '${{ parameters.componentName }} CMake Build and Install'
${{ else }}:
displayName: '${{ parameters.componentName }} CMake Build'
${{ if eq(parameters.os, 'almalinux8')}}:
env:
PATH: "/opt/rh/gcc-toolset-14/root/usr/bin:$(PATH)"
MANPATH: "/opt/rh/gcc-toolset-14/root/usr/share/man:$(MANPATH)"
INFOPATH: "/opt/rh/gcc-toolset-14/root/usr/share/info:$(INFOPATH)"
PCP_DIR: "/opt/rh/gcc-toolset-14/root"
LD_LIBRARY_PATH: "/opt/rh/gcc-toolset-14/root/usr/lib64:/opt/rh/gcc-toolset-14/root/usr/lib:$(LD_LIBRARY_PATH)"
PKG_CONFIG_PATH: "/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig:$(PKG_CONFIG_PATH)"
displayName: '${{parameters.componentName }} Build'
inputs:
workingDirectory: ${{ parameters.cmakeBuildDir }}
${{ if eq(parameters.os, 'almalinux8') }}:
cmakeArgs: >-
--build ${{ parameters.cmakeTargetDir }}
${{ iif(and(eq(parameters.consolidateBuildAndInstall, true), ne(parameters.cmakeTarget, '')), format('--target {0}', parameters.cmakeTarget), '') }}
${{ iif(and(ne(parameters.customBuildTarget, ''), ne(parameters.consolidateBuildAndInstall, true)), format('--target {0}', parameters.customBuildTarget), '') }}
${{ parameters.multithreadFlag }}
${{ if ne(parameters.os, 'almalinux8') }}:
cmakeArgs: >-
--build ${{ parameters.cmakeTargetDir }}
${{ iif(ne(parameters.customBuildTarget, ''), format('--target {0}', parameters.customBuildTarget), '') }}
${{ parameters.multithreadFlag }}
${{ if eq(parameters.customBuildTarget, '') }}:
cmakeArgs: '--build ${{ parameters.cmakeTargetDir }} ${{ parameters.multithreadFlag }}'
${{ else }}:
cmakeArgs: '--build ${{ parameters.cmakeTargetDir }} --target ${{ parameters.customBuildTarget }} ${{ parameters.multithreadFlag }}'
retryCountOnTaskFailure: 10
- ${{ if parameters.printDiskSpace }}:
- script: df -h
displayName: Disk space after build
# equivalent to running make $cmakeTarget from $cmakeBuildDir
# e.g., make install
- ${{ if and(eq(parameters.installEnabled, true), or(ne(parameters.os, 'almalinux8'), eq(parameters.consolidateBuildAndInstall, false))) }}:
- ${{ if eq(parameters.installEnabled, true) }}:
- task: CMake@1
displayName: '${{parameters.componentName }} ${{ parameters.cmakeTarget }}'
inputs:

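Rather than sourcing the enable script (which a CMake@1 task cannot do), the template reproduces its effect by exporting the same variables on the task. A reduced sketch of how the almalinux8 branch renders, with the build directory and source path assumed:

- task: CMake@1
  displayName: 'example CMake Flags (almalinux8 rendering)'
  env:
    PATH: "/opt/rh/gcc-toolset-14/root/usr/bin:$(PATH)"
    LD_LIBRARY_PATH: "/opt/rh/gcc-toolset-14/root/usr/lib64:$(LD_LIBRARY_PATH)"
  inputs:
    workingDirectory: build
    cmakeArgs: >-
      -DCMAKE_EXE_LINKER_FLAGS="-L/opt/rh/gcc-toolset-14/root/usr/lib/gcc/x86_64-redhat-linux/14/"
      ..
      # '..' stands in for the assumed cmakeSourceDir one level above the build dir
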
@@ -4,6 +4,9 @@ parameters:
- name: checkoutRepo
type: string
default: 'self'
- name: sparseCheckout
type: boolean
default: false
- name: sparseCheckoutDir
type: string
default: ''
@@ -19,10 +22,10 @@ steps:
submodules: ${{ parameters.submoduleBehaviour }}
retryCountOnTaskFailure: 3
fetchFilter: blob:none
${{ if ne(parameters.sparseCheckoutDir, '') }}:
${{ if eq(parameters.sparseCheckout, true) }}:
sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }}
path: sparse
- ${{ if ne(parameters.sparseCheckoutDir, '') }}:
- ${{ if eq(parameters.sparseCheckout, true) }}:
- task: Bash@3
displayName: Symlink sparse checkout
inputs:

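A hedged sketch of calling the updated checkout template with the new boolean toggle; the repo alias and monorepo subdirectory here are assumptions for illustration only:

- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
  parameters:
    checkoutRepo: libraries_repo        # assumed repo alias
    sparseCheckout: true
    sparseCheckoutDir: projects/example # assumed monorepo subdirectory
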
@@ -1,42 +0,0 @@
parameters:
- name: aptPackages
type: object
default: []
- name: registerROCmPackages
type: boolean
default: false

steps:
- ${{ if eq(parameters.registerROCmPackages, true) }}:
- task: Bash@3
displayName: 'Register AMDGPU & ROCm repos (apt)'
inputs:
targetType: inline
script: |
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/$(REPO_RADEON_VERSION)/ubuntu jammy main" | sudo tee /etc/apt/sources.list.d/amdgpu.list
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION) jammy main" | sudo tee --append /etc/apt/sources.list.d/rocm.list
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt update
- task: Bash@3
displayName: 'sudo apt-get update'
inputs:
targetType: inline
script: |
echo "deb http://archive.ubuntu.com/ubuntu/ jammy main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes update
- task: Bash@3
displayName: 'sudo apt-get fix'
inputs:
targetType: inline
script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-broken install
- ${{ if gt(length(parameters.aptPackages), 0) }}:
- task: Bash@3
displayName: 'sudo apt-get install ...'
inputs:
targetType: inline
script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install ${{ join(' ', parameters.aptPackages) }}
@@ -1,44 +1,25 @@
parameters:
- name: os
type: string
default: ubuntu2204

steps:
- task: Bash@3
displayName: Get aqlprofile package name
inputs:
targetType: inline
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: |
export packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb")
echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
${{ if eq(parameters.os, 'almalinux8') }}:
script: |
export packageName=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1)
echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
script: |
export packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb")
echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
- task: Bash@3
displayName: 'Download aqlprofile'
inputs:
targetType: inline
script: wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$(packageName)
workingDirectory: '$(Pipeline.Workspace)'
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$(packageName)
${{ if eq(parameters.os, 'almalinux8') }}:
script: wget -nv https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/$(packageName)
- task: Bash@3
displayName: 'Extract aqlprofile'
inputs:
targetType: inline
script: |
mkdir hsa-amd-aqlprofile
dpkg-deb -R $(packageName) hsa-amd-aqlprofile
workingDirectory: '$(Pipeline.Workspace)'
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: |
mkdir hsa-amd-aqlprofile
dpkg-deb -R $(packageName) hsa-amd-aqlprofile
${{ if eq(parameters.os, 'almalinux8') }}:
script: |
mkdir hsa-amd-aqlprofile
sudo dnf -y install rpm-build cpio
rpm2cpio $(packageName) | (cd hsa-amd-aqlprofile && cpio -idmv)
- task: Bash@3
displayName: 'Copy aqlprofile files'
inputs:

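The discovery step scrapes the repo index instead of hardcoding a package version. Run against the rhel8 tree, the almalinux8 branch of the script resolves roughly as below; the sample file name in the comment is invented for illustration:

- task: Bash@3
  displayName: Get aqlprofile package name (sketch)
  inputs:
    targetType: inline
    script: |
      # grep pulls the first hsa-amd-aqlprofile-*.rpm name out of the directory listing
      export packageName=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1)
      echo "$packageName"   # e.g. hsa-amd-aqlprofile-1.0.0.el8.x86_64.rpm (hypothetical)
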
35
.azuredevops/templates/steps/dependencies-boost.yml
Normal file
@@ -0,0 +1,35 @@
steps:
- task: DownloadPipelineArtifact@2
displayName: Download Boost
inputs:
buildType: specific
project: ROCm-CI
definition: $(BOOST_DEPENDENCY_PIPELINE_ID)
targetPath: $(Pipeline.Workspace)/d
- task: ExtractFiles@1
displayName: Extract Boost
inputs:
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
destinationFolder: $(Agent.BuildDirectory)/boost
cleanDestinationFolder: true
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: Cleanup Compressed Boost
inputs:
SourceFolder: $(Pipeline.Workspace)/d
Contents: '**/*.tar.gz'
RemoveDotFiles: true
- task: Bash@3
displayName: 'List Boost files'
inputs:
targetType: inline
script: ls -1R $(Agent.BuildDirectory)/boost
- task: Bash@3
displayName: 'Link Boost shared libraries'
inputs:
targetType: inline
script: |
echo $(Agent.BuildDirectory)/boost/lib | sudo tee /etc/ld.so.conf.d/boost.conf
sudo cat /etc/ld.so.conf.d/boost.conf
sudo ldconfig -v
ldconfig -p
@@ -1,23 +1,10 @@
# replace cmake from apt install with newest version using snap install
steps:
- task: Bash@3
displayName: Install CMake 3.31
displayName: update cmake
inputs:
targetType: inline
script: |
CMAKE_VERSION=3.31.0
CMAKE_ROOT="$(Pipeline.Workspace)/cmake"

echo "Downloading CMake $CMAKE_VERSION..."
curl -fsSL -o cmake.tar.gz https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz

echo "Extracting to $CMAKE_ROOT..."
sudo mkdir -p $CMAKE_ROOT
sudo tar --strip-components=1 -xz -C $CMAKE_ROOT -f cmake.tar.gz

echo "##vso[task.prependpath]$CMAKE_ROOT/bin"
- task: Bash@3
displayName: cmake --version
inputs:
targetType: inline
script: |
cmake --version
sudo apt purge cmake -y
sudo snap install cmake --classic --channel=3.31/stable
hash -r

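The `##vso[task.prependpath]` logging command at the end of the install script is what makes the downloaded CMake shadow the preinstalled one in later steps. Roughly, it has the effect of the following shell sketch (the path is illustrative), except that Azure Pipelines applies it to every subsequent step in the job:

```bash
# Approximate effect of ##vso[task.prependpath]$CMAKE_ROOT/bin (illustrative path):
CMAKE_ROOT="$HOME/cmake"
export PATH="$CMAKE_ROOT/bin:$PATH"
command -v cmake    # now resolves to $CMAKE_ROOT/bin/cmake
cmake --version
```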
@@ -1,157 +0,0 @@
parameters:
- name: aptPackages
type: object
default: []
- name: registerROCmPackages
type: boolean
default: false
# As part of installing gcc toolset and python,
# the environment will install this base set of dnf packages.
- name: basePackages
type: object
default:
- epel-release
- gcc-toolset-14
- gcc-toolset-14-libatomic-devel
- git
- jq
- numactl
- python3.11
- python3.11-pip
- vim-common
- wget
# Instead of defining multiple arrays of packages per component,
# we define a map of apt package names to dnf package names.
- name: aptToDnfMap
type: object
default:
bison: bison
ccache: ccache
cmake: cmake
cuda-toolkit-12-9: cuda-compiler-12-9 cuda-toolkit-12-9
libcudnn9-dev-cuda-12: libcudnn9-cuda-12
dejagnu: dejagnu
doxygen: doxygen
# note: doxygen-doc is not available in dnf
# libavcodec-dev, libavformat-dev, libavutil-dev come with ffmpeg-devel
ffmpeg: ffmpeg ffmpeg-devel
flex: flex
# note: g++ is installed by default with gcc-toolset-14
# note: gawk is already installed
# note: gcc-toolset-14-gfortran is installed by default with gcc-toolset-14
# note: git is in the base packages list
graphviz: graphviz
libbabeltrace-dev: libbabeltrace-devel
libbison-dev: bison-devel
libboost-program-options-dev: boost-devel
# note: libdrm-amdgpu1 is not available in dnf
libdrm-dev: libdrm-devel
libdrm-amdgpu-dev: libdrm-amdgpu-devel
libdw-dev: elfutils-devel
libelf-dev: elfutils-libelf-devel
libexpat-dev: expat-devel
libffi-dev: libffi-devel
libfftw3-dev: fftw-devel
libgmp-dev: gmp-devel
liblzma-dev: xz-devel
libmpfr-dev: mpfr-devel
libmsgpack-dev: msgpack-devel
libncurses5-dev: ncurses-devel
libnuma-dev: numactl-devel
libopenmpi-dev: openmpi-devel
libpci-dev: libpciaccess-devel
libssl-dev: openssl-devel
# note: libstdc++-devel is in the base packages list
libsystemd-dev: systemd-devel
libtool: libtool
# note: libudev-dev is part of systemd-devel
libva-amdgpu-dev: libva-amdgpu-devel
mesa-amdgpu-va-drivers: mesa-amdgpu-va-drivers
mesa-common-dev: mesa-libGL-devel
ncurses-dev: ncurses-devel
# note: llvm needs ninja-build version newer than what dnf provides
ocl-icd-libopencl1: ocl-icd
ocl-icd-opencl-dev: ocl-icd-devel
opencl-headers: opencl-headers
parallel: parallel
pkg-config: pkgconf-pkg-config
# note: python3 is the default python in AlmaLinux 8
python3-dev: python3.11-devel
# note: python3.11-pip is already installed when updating to python 3.11
# note: python3.11-setuptools is already installed when updating to python 3.11
texinfo: texinfo
zlib1g-dev: zlib-devel

steps:
- ${{ if eq(parameters.registerROCmPackages, true) }}:
- task: Bash@3
displayName: 'Register AMDGPU & ROCm repos (dnf)'
inputs:
targetType: inline
script: |
sudo rpm --import https://repo.radeon.com/rocm/rocm.gpg.key
echo '[amdgpu]' | sudo tee /etc/yum.repos.d/amdgpu.repo > /dev/null
echo "name=amdgpu" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
echo "baseurl=https://repo.radeon.com/amdgpu/$(REPO_RADEON_VERSION)/rhel/8.10/main/x86_64/" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
echo "enabled=1" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
echo "gpgcheck=1" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
echo "gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" | sudo tee --append /etc/yum.repos.d/amdgpu.repo
echo '[rocm]' | sudo tee /etc/yum.repos.d/rocm.repo > /dev/null
echo "name=ROCm$(REPO_RADEON_VERSION)" | sudo tee --append /etc/yum.repos.d/rocm.repo
echo "baseurl=https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/" | sudo tee --append /etc/yum.repos.d/rocm.repo
echo "enabled=1" | sudo tee --append /etc/yum.repos.d/rocm.repo
echo "gpgcheck=1" | sudo tee --append /etc/yum.repos.d/rocm.repo
echo "gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" | sudo tee --append /etc/yum.repos.d/rocm.repo
sudo dnf clean all
sudo dnf makecache
- task: Bash@3
displayName: 'Install base dnf packages'
inputs:
targetType: inline
script: |
sudo dnf config-manager --set-enabled powertools
# rpm fusion free repo for some dependencies
sudo dnf -y install https://download1.rpmfusion.org/free/el/rpmfusion-free-release-8.noarch.rpm
sudo dnf -y install ${{ join(' ', parameters.basePackages) }}
- task: Bash@3
displayName: 'Check gcc environment'
inputs:
targetType: inline
script: |
echo "=== Versions and sanity checks ==="
gcc --version
g++ --version
gcc -print-file-name=libstdc++.so
g++ -print-file-name=libstdc++.so
- task: Bash@3
displayName: 'Set python 3.11 as default'
inputs:
targetType: inline
script: |
sudo dnf -y module disable python36
sudo rm -f /usr/local/bin/python3.12 /usr/local/bin/python3.13 /usr/local/bin/python3.14
sudo alternatives --set python /usr/bin/python3.11
sudo alternatives --set python3 /usr/bin/python3.11
python3 --version
python3 -m pip install --upgrade pip setuptools wheel
- ${{ each pkg in parameters.aptPackages }}:
# note: llvm needs ninja-build version newer than what dnf provides
- ${{ if eq(pkg, 'ninja-build') }}:
- task: Bash@3
displayName: 'Install ninja 1.11.1'
inputs:
targetType: inline
script: |
curl -LO https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip
sudo dnf -y install unzip
unzip ninja-linux.zip
sudo mv ninja /usr/local/bin/ninja
sudo chmod +x /usr/local/bin/ninja
echo "##vso[task.prependpath]/usr/local/bin"
- ${{ if ne(parameters.aptToDnfMap[pkg], '') }}:
- task: Bash@3
displayName: 'dnf install ${{ parameters.aptToDnfMap[pkg] }}'
inputs:
targetType: inline
script: |
sudo dnf -y install ${{ parameters.aptToDnfMap[pkg] }}
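To make the map's use concrete, here is a hedged shell sketch of the translation the `${{ each pkg }}` loop performs at template-expansion time (the associative array is a stand-in for the `aptToDnfMap` parameter, with a few entries from the table above):

```bash
# Stand-in for aptToDnfMap; entries abbreviated from the table above:
declare -A apt_to_dnf=(
  [libelf-dev]="elfutils-libelf-devel"
  [libssl-dev]="openssl-devel"
  [zlib1g-dev]="zlib-devel"
)
for pkg in libelf-dev libssl-dev zlib1g-dev; do
  dnf_pkg="${apt_to_dnf[$pkg]}"
  # Skip apt packages with no dnf equivalent, as the template's ne() guard does:
  [ -n "$dnf_pkg" ] && sudo dnf -y install $dnf_pkg
done
```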
@@ -9,24 +9,56 @@ parameters:
- name: registerROCmPackages
type: boolean
default: false
- name: packageManager
type: string
default: apt

steps:
- ${{ if eq(parameters.packageManager, 'apt') }}:
- template: dependencies-apt.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
registerROCmPackages: ${{ parameters.registerROCmPackages }}
- ${{ if eq(parameters.packageManager, 'dnf') }}:
- template: dependencies-dnf.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
registerROCmPackages: ${{ parameters.registerROCmPackages }}
- ${{ if eq(parameters.registerROCmPackages, true) }}:
- task: Bash@3
displayName: 'Register AMDGPU & ROCm repos'
inputs:
targetType: inline
script: |
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/$(REPO_RADEON_VERSION)/ubuntu jammy main" | sudo tee /etc/apt/sources.list.d/amdgpu.list
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION) jammy main" | sudo tee --append /etc/apt/sources.list.d/rocm.list
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt update
# firefox takes time to upgrade and is not needed for CI workloads, hold version
- task: Bash@3
continueOnError: true
displayName: 'sudo apt-mark hold firefox'
inputs:
targetType: inline
script: sudo apt-mark hold firefox
- task: Bash@3
displayName: 'sudo apt-get update'
inputs:
targetType: inline
script: |
echo "deb http://archive.ubuntu.com/ubuntu/ jammy main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
echo "deb http://archive.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
sudo DEBIAN_FRONTEND=noninteractive apt-get --yes update
- task: Bash@3
displayName: 'sudo apt-get upgrade'
inputs:
targetType: inline
script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes upgrade
- task: Bash@3
displayName: 'sudo apt-get fix'
inputs:
targetType: inline
script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-broken install
- ${{ if gt(length(parameters.aptPackages), 0) }}:
- task: Bash@3
displayName: 'sudo apt-get install ...'
inputs:
targetType: inline
script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install ${{ join(' ', parameters.aptPackages) }}
- ${{ if gt(length(parameters.pipModules), 0) }}:
- task: Bash@3
displayName: 'pip install ...'
inputs:
targetType: inline
script: python3 -m pip install -v --force-reinstall ${{ join(' ', parameters.pipModules) }}
script: pip install -v --force-reinstall ${{ join(' ', parameters.pipModules) }}

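A quick way to confirm that the keyring and the Pin-Priority 600 registration above took effect (hedged; the package name is only an example):

```bash
# Sanity-check the apt repo registration from the step above:
apt-cache policy | grep -A1 repo.radeon.com   # should show entries with priority 600
apt-cache policy rocm-core                    # example package; candidate should come from repo.radeon.com
gpg --show-keys /etc/apt/keyrings/rocm.gpg    # inspect the imported signing key
```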
@@ -13,9 +13,6 @@ parameters:
- name: dependencyList
type: object
default: []
- name: os
type: string
default: 'ubuntu2204'
- name: gpuTarget
type: string
default: ''
@@ -39,10 +36,6 @@ parameters:
- name: aggregatePipeline
type: boolean
default: false
# monorepo related parameters
- name: downstreamAggregateNames
type: string
default: ''

- name: componentVarList
type: object
@@ -110,7 +103,7 @@ parameters:
hipCUB:
pipelineId: $(HIPCUB_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: develop
mainlineBranch: mainline
hasGpuTarget: true
hipFFT:
pipelineId: $(HIPFFT_PIPELINE_ID)
@@ -130,7 +123,7 @@ parameters:
hipRAND:
pipelineId: $(HIPRAND_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: develop
mainlineBranch: mainline
hasGpuTarget: true
hipSOLVER:
pipelineId: $(HIPSOLVER_PIPELINE_ID)
@@ -265,7 +258,7 @@ parameters:
rocPRIM:
pipelineId: $(ROCPRIM_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: develop
mainlineBranch: mainline
hasGpuTarget: true
rocprofiler:
pipelineId: $(ROCPROFILER_PIPELINE_ID)
@@ -305,7 +298,7 @@ parameters:
rocRAND:
pipelineId: $(ROCRAND_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: develop
mainlineBranch: mainline
hasGpuTarget: true
rocr_debug_agent:
pipelineId: $(ROCR_DEBUG_AGENT_PIPELINE_ID)
@@ -330,7 +323,7 @@ parameters:
rocThrust:
pipelineId: $(ROCTHRUST_PIPELINE_ID)
stagingBranch: develop
mainlineBranch: develop
mainlineBranch: mainline
hasGpuTarget: true
roctracer:
pipelineId: $(ROCTRACER_PIPELINE_ID)
@@ -368,7 +361,7 @@ steps:
pipelineId: ${{ parameters.componentVarList[split(dependency, ':')[0]].pipelineId }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.componentVarList[split(dependency, ':')[0]].hasGpuTarget }}:
fileFilter: "${{ split(dependency, ':')[1] }}*_${{ parameters.os }}_${{ parameters.gpuTarget }}"
fileFilter: "${{ split(dependency, ':')[1] }}*${{ parameters.gpuTarget }}"
# dependencySource = staging
${{ if eq(parameters.dependencySource, 'staging')}}:
branchName: ${{ parameters.componentVarList[split(dependency, ':')[0]].stagingBranch }}
@@ -391,14 +384,6 @@ steps:
${{ else }}:
branchName: ${{ parameters.componentVarList[split(dependency, ':')[0]].stagingBranch }}
# no colon (:) found in this item in the list
- ${{ elseif containsValue(split(parameters.downstreamAggregateNames, '+'), dependency) }}:
- template: local-artifact-download.yml
parameters:
${{ if parameters.componentVarList[dependency].hasGpuTarget }}:
gpuTarget: ${{ parameters.gpuTarget }}
preTargetFilter: ${{ dependency }}
os: ${{ parameters.os }}
buildType: current
- ${{ else }}:
- template: artifact-download.yml
parameters:
@@ -406,9 +391,7 @@ steps:
pipelineId: ${{ parameters.componentVarList[dependency].pipelineId }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.componentVarList[dependency].hasGpuTarget }}:
fileFilter: ${{ parameters.os }}_${{ parameters.gpuTarget }}
${{ else }}:
fileFilter: ${{ parameters.os }}
fileFilter: ${{ parameters.gpuTarget }}
# dependencySource = staging
${{ if eq(parameters.dependencySource, 'staging')}}:
branchName: ${{ parameters.componentVarList[dependency].stagingBranch }}
@@ -436,16 +419,14 @@ steps:
displayName: Symlink from rocm/llvm to rocm/lib/llvm
inputs:
targetType: inline
script: |
sudo mkdir -p $(Agent.BuildDirectory)/rocm/lib
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
script: sudo ln -s $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
- task: Bash@3
displayName: Symlink executables from rocm/llvm/bin to rocm/bin
inputs:
targetType: inline
script: |
for file in amdclang amdclang++ amdclang-cl amdclang-cpp amdflang amdlld aompcc mygpu mycpu offload-arch; do
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
sudo ln -s $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
done
# dlopen calls within a ctest or pytest sequence run into issues when the shared library symlink convention is not followed
# the convention is as follows:
@@ -490,10 +471,8 @@ steps:
targetType: inline
# the OS ignores duplicates if the ROCm lib folder shows up more than once
script: |
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
sudo cat /etc/ld.so.conf.d/rocm-ci.conf
sudo ldconfig -v
ldconfig -p

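The truncated comment above refers to the conventional ELF shared-library symlink chain; a hedged illustration with a made-up library name:

```bash
# Conventional symlink chain the comment alludes to (libfoo is hypothetical):
#   libfoo.so       -> libfoo.so.1       # dev name, used at link time
#   libfoo.so.1     -> libfoo.so.1.2.3   # soname, resolved by the loader and dlopen
#   libfoo.so.1.2.3                      # the real file
ln -s libfoo.so.1.2.3 libfoo.so.1
ln -s libfoo.so.1     libfoo.so
```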
@@ -1,53 +0,0 @@
parameters:
- name: os
type: string
default: 'ubuntu2204'
- name: dependencyList
type: object
- name: pipelineIdList
type: object
default:
boost: 250
grpc: 72
gtest: 73
half560: 68
lapack: 69

steps:
- ${{ each dependency in parameters.dependencyList }}:
- task: DownloadPipelineArtifact@2
displayName: Download ${{ dependency }}
inputs:
project: ROCm-CI
buildType: specific
targetPath: $(Pipeline.Workspace)/d
definition: ${{ parameters.pipelineIdList[dependency] }}
itemPattern: '**/*${{ parameters.os }}*'
- task: ExtractFiles@1
displayName: Extract ${{ dependency }}
inputs:
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
destinationFolder: $(Agent.BuildDirectory)/vendor
cleanDestinationFolder: true
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: Clean up ${{ dependency }}
inputs:
SourceFolder: $(Pipeline.Workspace)/d
Contents: '**/*.tar.gz'
RemoveDotFiles: true
- task: Bash@3
displayName: List vendored files
inputs:
targetType: inline
script: ls -la1R $(Agent.BuildDirectory)/vendor
- task: Bash@3
displayName: Link vendored shared libraries
inputs:
targetType: inline
script: |
echo $(Agent.BuildDirectory)/vendor/lib | sudo tee -a /etc/ld.so.conf.d/vendor.conf
echo $(Agent.BuildDirectory)/vendor/lib64 | sudo tee -a /etc/ld.so.conf.d/vendor.conf
sudo cat /etc/ld.so.conf.d/vendor.conf
sudo ldconfig -v
ldconfig -p
@@ -2,9 +2,6 @@
# It can be overridden to download any artifact from any pipeline, given the appropriate build/pipeline IDs

parameters:
- name: os
type: string
default: 'ubuntu2204'
- name: gpuTarget
type: string
default: ''
@@ -32,27 +29,25 @@ parameters:

steps:
- task: DownloadPipelineArtifact@2
displayName: Download ${{ parameters.preTargetFilter}}*${{ parameters.os }}_${{ parameters.gpuTarget}}*${{ parameters.postTargetFilter}}
displayName: 'Download Pipeline Build'
inputs:
${{ if eq(parameters.buildType, 'specific') }}:
buildType: specific
buildVersionToDownload: specific
project: ROCm-CI
${{ if ne(parameters.definitionId, 0) }}:
definition: ${{ parameters.definitionId }}
${{ if ne(parameters.buildId, 0) }}:
buildId: ${{ parameters.buildId }}
itemPattern: '**/*${{ parameters.preTargetFilter }}*${{ parameters.os }}_${{ parameters.gpuTarget }}*${{ parameters.postTargetFilter }}*'
definition: ${{ parameters.definitionId }}
buildId: ${{ parameters.buildId }}
itemPattern: '**/*${{ parameters.preTargetFilter }}*${{ parameters.gpuTarget }}*${{ parameters.postTargetFilter }}*'
targetPath: $(Pipeline.Workspace)/d
- task: ExtractFiles@1
displayName: Extract ${{ parameters.preTargetFilter}}*${{ parameters.os }}_${{ parameters.gpuTarget}}*${{ parameters.postTargetFilter}}
displayName: 'Extract Pipeline Build'
inputs:
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
destinationFolder: '$(Agent.BuildDirectory)/rocm'
cleanDestinationFolder: false
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: Clean up ${{ parameters.preTargetFilter}}*${{ parameters.os }}_${{ parameters.gpuTarget}}*${{ parameters.postTargetFilter}}
displayName: 'Clean up Compressed Pipeline Build'
inputs:
SourceFolder: '$(Pipeline.Workspace)/d'
Contents: '/**/*.tar.xz'

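To see how the new `itemPattern` composes in practice, take `preTargetFilter=rocBLAS`, `gpuTarget=gfx942`, and an empty `postTargetFilter` (all illustrative values); the pattern then resolves roughly as follows:

```bash
# Illustrative expansion of itemPattern for the example values above:
#   '**/*rocBLAS*gfx942**'
# which would match an artifact archive named, say, rocBLAS_ubuntu2204_gfx942.tar.gz
shopt -s globstar            # enable ** in bash for this demonstration
ls **/*rocBLAS*gfx942*
```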
@@ -1,19 +1,10 @@
parameters:
- name: componentName
- name: artifactName
type: string
default: $(Build.DefinitionName)
- name: sparseCheckoutDir
type: string
default: ''
default: 'drop'
- name: gpuTarget
type: string
default: ''
- name: artifactName
type: string
default: drop
- name: os
type: string
default: 'ubuntu2204'

steps:
- task: Bash@3
@@ -34,9 +25,8 @@ steps:

IS_TAG_BUILD=$(jq 'has("release_repo")' resources.repositories)
IS_AOMP_BUILD=$(jq 'has("aomp_repo")' resources.repositories)
IS_MATHLIBS_BUILD=$(jq 'has("libraries_repo")' resources.repositories)

if [ "$IS_TAG_BUILD" = "true" ] || [ "$IS_AOMP_BUILD" = "true" ] || [ "$IS_MATHLIBS_BUILD" = "true" ]; then
if [ "$IS_TAG_BUILD" = "true" ] || [ "$IS_AOMP_BUILD" = "true" ]; then
exclude_keys=("pipelines_repo" "self") # Triggered by a file under ROCm/ROCm
else
exclude_keys=("pipelines_repo") # Triggered by a file under a component repo
@@ -55,7 +45,6 @@ steps:
buildId: "$(Build.BuildId)",
repoId: $entry.value.id,
repoName: $entry.value.name,
repoSparse: "${{ parameters.sparseCheckoutDir }}",
repoRef: $entry.value.ref,
repoUrl: $entry.value.url,
repoVersion: $entry.value.version
@@ -66,7 +55,7 @@ steps:
)
' resources.repositories)

manifest_json=$(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json
manifest_json=$(Build.ArtifactStagingDirectory)/manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json

dependencies=()
for manifest_file in $(Pipeline.Workspace)/d/**/manifest_*.json; do
@@ -92,7 +81,6 @@ steps:
"<tr><td>" + .buildNumber + "</td>" +
"<td><a href=\"https://dev.azure.com/ROCm-CI/ROCm-CI/_build/results?buildId=" + .buildId + "\">" + .buildId + "</a></td>" +
"<td><a href=\"" + .repoUrl + "\">" + .repoName + "</a></td>" +
"<td><a href=\"" + .repoUrl + "/tree/" + .repoRef + "/" + .repoSparse + "\">" + .repoSparse + "</a></td>" +
"<td><a href=\"" + .repoUrl + "/tree/" + .repoRef + "\">" + .repoRef + "</a></td>" +
"<td><a href=\"" + .repoUrl + "/commit/" + .repoVersion + "\">" + .repoVersion + "</a></td></tr>"
')
@@ -105,7 +93,6 @@ steps:
"<tr><td>" + .buildNumber + "</td>" +
"<td><a href=\"https://dev.azure.com/ROCm-CI/ROCm-CI/_build/results?buildId=" + .buildId + "\">" + .buildId + "</a></td>" +
"<td><a href=\"" + .repoUrl + "\">" + .repoName + "</a></td>" +
"<td><a href=\"" + .repoUrl + "/tree/" + .repoRef + "/" + .repoSparse + "\">" + .repoSparse + "</a></td>" +
"<td><a href=\"" + .repoUrl + "/tree/" + .repoRef + "\">" + .repoRef + "</a></td>" +
"<td><a href=\"" + .repoUrl + "/commit/" + .repoVersion + "\">" + .repoVersion + "</a></td></tr>"
')
@@ -120,7 +107,7 @@ steps:
inputs:
targetType: inline
script: |
manifest_html=$(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
manifest_html=$(Build.ArtifactStagingDirectory)/manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
cat <<EOF > $manifest_html
<html>
<h1>Manifest</h1>
@@ -130,7 +117,6 @@ steps:
<th>Build Number</th>
<th>Build ID</th>
<th>Repo Name</th>
<th>Repo Sparse</th>
<th>Repo Ref</th>
<th>Repo Version</th>
</tr>
@@ -142,7 +128,6 @@ steps:
<th>Build Number</th>
<th>Build ID</th>
<th>Repo Name</th>
<th>Repo Sparse</th>
<th>Repo Ref</th>
<th>Repo Version</th>
</tr>
@@ -163,7 +148,7 @@ steps:
continueOnError: true
inputs:
tabName: Manifest
reportDir: $(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
reportDir: $(Build.ArtifactStagingDirectory)/manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
- task: Bash@3
displayName: Save manifest artifact file name
condition: always()
@@ -172,5 +157,5 @@ steps:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: |
echo "manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html" >> pipelineArtifacts.txt
echo "manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json" >> pipelineArtifacts.txt
echo "manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html" >> pipelineArtifacts.txt
echo "manifest_$(Build.DefinitionName)_$(Build.BuildId)_$(Build.BuildNumber)_ubuntu2204_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json" >> pipelineArtifacts.txt

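For reference, the manifest names above pack several build identifiers into one underscore-separated string; an illustrative composition with made-up values:

```bash
# Illustrative manifest file name composition (all values are examples):
componentName=rocBLAS
buildId=12345
buildNumber=20240101.1
os=ubuntu2204
gpuTarget=gfx942
artifactName=drop
echo "manifest_${componentName}_${buildId}_${buildNumber}_${os}_${gpuTarget}_${artifactName}.json"
# -> manifest_rocBLAS_12345_20240101.1_ubuntu2204_gfx942_drop.json
```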
@@ -25,7 +25,7 @@ steps:
echo "Fetching CK build ID for commit $CK_COMMIT"
CK_CHECKS_URL="$GH_API/composable_kernel/commits/${CK_COMMIT}/check-runs"
CK_BUILD_ID=$(curl -s $CK_CHECKS_URL | \
jq '.check_runs[] | select(.name == "composable_kernel" and .app.slug == "azure-pipelines" and .conclusion == "success") | .details_url' | \
jq '.check_runs[] | select(.name == "composable_kernel" and .app.slug == "azure-pipelines") | .details_url' | \
tr -d '"' | grep -oP 'buildId=\K\d+')

# If none found, use latest successful CK build instead

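The `\K` in the `grep -oP` call above resets the start of the reported match, so only the numeric build ID is printed; for example (the URL is illustrative):

```bash
# How the buildId extraction above behaves on a sample details_url:
echo 'https://dev.azure.com/ROCm-CI/ROCm-CI/_build/results?buildId=98765' \
  | grep -oP 'buildId=\K\d+'
# prints: 98765
```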
@@ -3,27 +3,10 @@
# also display installed components and packages
steps:
- task: Bash@3
displayName: OS Version
displayName: List apt packages
inputs:
targetType: inline
script: cat /etc/os-release
- task: Bash@3
displayName: List installed packages (apt, dnf, or yum)
inputs:
targetType: inline
script: |
if command -v apt >/dev/null 2>&1; then
echo "Listing installed packages with apt:"
apt list --installed
elif command -v dnf >/dev/null 2>&1; then
echo "Listing installed packages with dnf:"
dnf list installed
elif command -v yum >/dev/null 2>&1; then
echo "Listing installed packages with yum:"
yum list installed
else
echo "No supported package manager found (apt, dnf, yum)."
fi
script: apt list --installed
- task: Bash@3
displayName: Print Python version
inputs:
@@ -33,7 +16,7 @@ steps:
displayName: List Python packages
inputs:
targetType: inline
script: python3 -m pip list -v
script: pip list -v
# The "Azure Pipelines" agents install CMake in multiple ways, including a standalone install into /usr/local/bin:
# https://github.com/actions/runner-images/blob/6d939a3ab352a54a021dd67b071577287b6f14a5/images/ubuntu/scripts/build/install-cmake.sh#L27
# This standalone CMake does not have a fixed version, and is not the same version as the one installed by the package manager

@@ -2,27 +2,21 @@ parameters:
- name: componentName
type: string
default: ''
- name: os
type: string
default: ubuntu2204
- name: testDir
type: string
default: build
default: 'build'
- name: testExecutable
type: string
default: ctest
default: 'ctest'
- name: testParameters
type: string
default: --output-on-failure --force-new-ctest-process --output-junit test_output.xml
- name: extraTestParameters
type: string
default: ''
default: '--output-on-failure --force-new-ctest-process --output-junit test_output.xml'
- name: testOutputFile
type: string
default: test_output.xml
- name: testOutputFormat
type: string
default: JUnit
default: 'JUnit'
values:
- JUnit
- NUnit
@@ -32,28 +26,26 @@ parameters:
- name: testPublishResults
type: boolean
default: true
- name: allowComponentTestFailure
- name: allowPartiallySucceededBuilds
type: object
default:
- amdsmi
- HIPIFY
- rocm_smi_lib
- roctracer
# the following do not use this template but allow test failures, included for completeness
- aomp
- ROCgdb
- HIPIFY
- MIVisionX
- rocm_smi_lib
- rocprofiler-sdk
- roctracer

steps:
# run test, continue on failure to publish results
# and to publish build artifacts
- task: Bash@3
displayName: '${{ parameters.componentName }} Test'
continueOnError: ${{ containsValue(parameters.allowComponentTestFailure, parameters.componentName) }}
continueOnError: ${{ containsValue(parameters.allowPartiallySucceededBuilds, parameters.componentName) }}
inputs:
targetType: inline
script: |
${{ iif(eq(parameters.os, 'almalinux8'), 'source /opt/rh/gcc-toolset-14/enable', '') }}
${{ parameters.testExecutable }} ${{ parameters.testParameters }} ${{ parameters.extraTestParameters }}
script: ${{ parameters.testExecutable }} ${{ parameters.testParameters }}
workingDirectory: ${{ parameters.testDir }}
- ${{ if parameters.testPublishResults }}:
- task: PublishTestResults@2

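With the defaults above, the templated test step reduces to a single ctest invocation of roughly this shape (a sketch, not generated output; on almalinux8 the gcc toolset environment is sourced first):

```bash
# What the test step runs for the default parameters (sketch):
source /opt/rh/gcc-toolset-14/enable   # almalinux8 only, injected by the iif() above
ctest --output-on-failure --force-new-ctest-process --output-junit test_output.xml
```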
@@ -3,8 +3,6 @@
variables:
- name: RESOURCES_REPOSITORIES
value: $[ convertToJson(resources.repositories) ]
- name: CCACHE_DIR
value: $(Pipeline.Workspace)/ccache
- name: CI_ROOT_PATH
value: /.azuredevops
- name: CI_COMPONENT_PATH
@@ -32,136 +30,320 @@ variables:
- name: GFX90A_TEST_POOL
value: gfx90a_test_pool
- name: LATEST_RELEASE_VERSION
value: 6.4.1
value: 6.4.0
- name: REPO_RADEON_VERSION
value: 6.4.1
value: 6.4
- name: NEXT_RELEASE_VERSION
value: 7.0.0
value: 6.5.0
- name: LATEST_RELEASE_TAG
value: rocm-6.4.1
value: rocm-6.4.0
- name: DOCKER_SKIP_GFX
value: gfx90a
- name: AMDMIGRAPHX_GFX942_TEST_PIPELINE_ID
value: 197
- name: AMDMIGRAPHX_PIPELINE_ID
value: 113
- name: AMDMIGRAPHX_TAGGED_PIPELINE_ID
value: 60
- name: AMDSMI_PIPELINE_ID
value: 99
- name: AMDSMI_TAGGED_PIPELINE_ID
value: 33
- name: AOMP_EXTRAS_PIPELINE_ID
value: 111
- name: AOMP_EXTRAS_TAGGED_PIPELINE_ID
value: 75
- name: AOMP_PIPELINE_ID
value: 115
- name: AOMP_TAGGED_PIPELINE_ID
value: 76
- name: CCACHE_DIR
value: $(Pipeline.Workspace)/ccache
- name: CLR_PIPELINE_ID
value: 145
- name: CLR_TAGGED_PIPELINE_ID
value: 71
- name: COMPOSABLE_KERNEL_GFX942_TEST_PIPELINE_ID
value: 179
- name: COMPOSABLE_KERNEL_PIPELINE_ID
value: 86
- name: COMPOSABLE_KERNEL_TAGGED_PIPELINE_ID
value: 38
- name: FLANG_LEGACY_PIPELINE_ID
value: 77
- name: FLANG_LEGACY_TAGGED_PIPELINE_ID
value: 77
- name: HALF_PIPELINE_ID
value: 101
- name: HALF_TAGGED_PIPELINE_ID
value: 11
- name: HALF560_PIPELINE_ID
value: 68
- name: HALF560_BUILD_ID
value: 621
- name: HIP_PIPELINE_ID
value: 93
- name: HIP_TAGGED_PIPELINE_ID
value: 31
- name: HIP_TESTS_PIPELINE_ID
value: 233
- name: HIP_TESTS_TAGGED_PIPELINE_ID
value: 220
- name: HIPBLAS_COMMON_PIPELINE_ID
value: 223
- name: HIPBLAS_COMMON_TAGGED_PIPELINE_ID
value: 224
- name: HIPBLAS_GFX942_TEST_PIPELINE_ID
value: 202
- name: HIPBLAS_PIPELINE_ID
value: 87
- name: HIPBLAS_TAGGED_PIPELINE_ID
value: 44
- name: HIPBLASLT_GFX942_TEST_PIPELINE_ID
value: 187
- name: HIPBLASLT_PIPELINE_ID
value: 112
- name: HIPBLASLT_TAGGED_PIPELINE_ID
value: 45
- name: HIPCUB_GFX942_TEST_PIPELINE_ID
value: 186
- name: HIPCUB_PIPELINE_ID
value: 277
value: 97
- name: HIPCUB_TAGGED_PIPELINE_ID
value: 46
- name: HIPFFT_GFX942_TEST_PIPELINE_ID
value: 198
- name: HIPFFT_PIPELINE_ID
value: 121
- name: HIPFFT_TAGGED_PIPELINE_ID
value: 12
- name: HIPFORT_PIPELINE_ID
value: 102
- name: HIPFORT_TAGGED_PIPELINE_ID
value: 34
- name: HIPIFY_PIPELINE_ID
value: 92
- name: HIPIFY_TAGGED_PIPELINE_ID
value: 13
- name: HIPRAND_GFX942_TEST_PIPELINE_ID
value: 188
- name: HIPRAND_PIPELINE_ID
value: 275
value: 90
- name: HIPRAND_TAGGED_PIPELINE_ID
value: 42
- name: HIPSOLVER_GFX942_TEST_PIPELINE_ID
value: 201
- name: HIPSOLVER_PIPELINE_ID
value: 84
- name: HIPSOLVER_TAGGED_PIPELINE_ID
value: 52
- name: HIPSPARSE_GFX942_TEST_PIPELINE_ID
value: 195
- name: HIPSPARSE_PIPELINE_ID
value: 83
- name: HIPSPARSE_TAGGED_PIPELINE_ID
value: 14
- name: HIPSPARSELT_GFX942_TEST_PIPELINE_ID
value: 200
- name: HIPSPARSELT_PIPELINE_ID
value: 104
- name: HIPSPARSELT_TAGGED_PIPELINE_ID
value: 53
- name: HIPTENSOR_GFX942_TEST_PIPELINE_ID
value: 192
- name: HIPTENSOR_PIPELINE_ID
value: 105
- name: HIPTENSOR_TAGGED_PIPELINE_ID
value: 56
- name: LLVM_PROJECT_PIPELINE_ID
value: 2
- name: LLVM_PROJECT_TAGGED_PIPELINE_ID
value: 8
- name: MIOPEN_PIPELINE_ID
value: 108
- name: MIOPEN_TAGGED_PIPELINE_ID
value: 58
- name: MIVISIONX_PIPELINE_ID
value: 80
- name: MIVISIONX_TAGGED_PIPELINE_ID
value: 18
- name: OMNIPERF_PIPELINE_ID
value: 241
- name: OMNIPERF_TAGGED_PIPELINE_ID
value: 242
- name: OMNITRACE_PIPELINE_ID
value: 253
- name: OMNITRACE_TAGGED_PIPELINE_ID
value: 252
- name: RCCL_GFX942_TEST_PIPELINE_ID
value: 184
- name: RCCL_PIPELINE_ID
value: 107
- name: RCCL_TAGGED_PIPELINE_ID
value: 15
- name: RDC_PIPELINE_ID
value: 100
- name: RDC_TAGGED_PIPELINE_ID
value: 59
- name: ROCAL_PIPELINE_ID
value: 151
- name: ROCALUTION_GFX942_TEST_PIPELINE_ID
value: 196
- name: ROCALUTION_PIPELINE_ID
value: 89
- name: ROCALUTION_TAGGED_PIPELINE_ID
value: 16
- name: ROCBLAS_GFX942_TEST_PIPELINE_ID
value: 185
- name: ROCBLAS_PIPELINE_ID
value: 85
- name: ROCBLAS_TAGGED_PIPELINE_ID
value: 32
- name: ROCDBGAPI_PIPELINE_ID
value: 135
- name: ROCDBGAPI_TAGGED_PIPELINE_ID
value: 17
- name: ROCDECODE_PIPELINE_ID
value: 79
- name: ROCDECODE_TAGGED_PIPELINE_ID
value: 21
- name: ROCFFT_GFX942_TEST_PIPELINE_ID
value: 189
- name: ROCFFT_PIPELINE_ID
value: 120
- name: ROCFFT_TAGGED_PIPELINE_ID
value: 19
- name: ROCGDB_PIPELINE_ID
value: 134
- name: ROCGDB_TAGGED_PIPELINE_ID
value: 50
- name: ROCJPEG_PIPELINE_ID
value: 262
- name: ROCJPEG_TAGGED_PIPELINE_ID
value: 263
- name: ROCM_BANDWIDTH_TEST_PIPELINE_ID
value: 88
- name: ROCM_BANDWIDTH_TEST_TAGGED_PIPELINE_ID
value: 23
- name: ROCM_CMAKE_PIPELINE_ID
value: 6
- name: ROCM_CMAKE_TAGGED_PIPELINE_ID
value: 7
- name: ROCM_CORE_PIPELINE_ID
value: 103
- name: ROCM_CORE_TAGGED_PIPELINE_ID
value: 22
- name: ROCM_EXAMPLES_GFX942_TEST_PIPELINE_ID
value: 204
- name: ROCM_EXAMPLES_PIPELINE_ID
value: 216
- name: ROCM_EXAMPLES_TAGGED_PIPELINE_ID
value: 245
- name: ROCM_SMI_LIB_PIPELINE_ID
value: 96
- name: ROCM_SMI_LIB_TAGGED_PIPELINE_ID
value: 47
- name: ROCMINFO_PIPELINE_ID
value: 91
- name: ROCMINFO_TAGGED_PIPELINE_ID
value: 27
- name: ROCMLIR_PIPELINE_ID
value: 229
- name: ROCMLIR_TAGGED_PIPELINE_ID
value: 62
- name: ROCMVALIDATIONSUITE_PIPELINE_ID
value: 106
- name: ROCMVALIDATIONSUITE_TAGGED_PIPELINE_ID
value: 43
- name: ROCPRIM_GFX942_TEST_PIPELINE_ID
value: 180
- name: ROCPRIM_PIPELINE_ID
value: 273
value: 82
- name: ROCPRIM_TAGGED_PIPELINE_ID
value: 20
- name: ROCPROFILER_GFX942_TEST_PIPELINE_ID
value: 190
- name: ROCPROFILER_COMPUTE_PIPELINE_ID
value: 257
- name: ROCPROFILER_COMPUTE_TAGGED_PIPELINE_ID
value: 258
- name: ROCPROFILER_REGISTER_PIPELINE_ID
value: 1
- name: ROCPROFILER_REGISTER_TAGGED_PIPELINE_ID
value: 25
- name: ROCPROFILER_SDK_PIPELINE_ID
value: 246
- name: ROCPROFILER_SDK_TAGGED_PIPELINE_ID
value: 234
- name: ROCPROFILER_SYSTEMS_PIPELINE_ID
value: 255
- name: ROCPROFILER_SYSTEMS_TAGGED_PIPELINE_ID
value: 254
- name: ROCPROFILER_PIPELINE_ID
value: 143
- name: ROCPROFILER_TAGGED_PIPELINE_ID
value: 28
- name: ROCPYDECODE_PIPELINE_ID
value: 239
- name: ROCPYDECODE_TAGGED_PIPELINE_ID
value: 232
- name: ROCR_DEBUG_AGENT_PIPELINE_ID
value: 136
- name: ROCR_DEBUG_AGENT_TAGGED_PIPELINE_ID
value: 29
- name: ROCR_RUNTIME_PIPELINE_ID
value: 10
- name: ROCR_RUNTIME_TAGGED_PIPELINE_ID
value: 24
- name: ROCRAND_GFX942_TEST_PIPELINE_ID
value: 183
- name: ROCRAND_PIPELINE_ID
value: 274
value: 95
- name: ROCRAND_TAGGED_PIPELINE_ID
value: 41
- name: ROCSOLVER_GFX942_TEST_PIPELINE_ID
value: 199
- name: ROCSOLVER_PIPELINE_ID
value: 81
- name: ROCSOLVER_TAGGED_PIPELINE_ID
value: 55
- name: ROCSPARSE_GFX942_TEST_PIPELINE_ID
value: 191
- name: ROCSPARSE_PIPELINE_ID
value: 98
- name: ROCSPARSE_TAGGED_PIPELINE_ID
value: 67
- name: ROCT_THUNK_INTERFACE_PIPELINE_ID
value: 3
- name: ROCT_THUNK_INTERFACE_TAGGED_PIPELINE_ID
value: 9
- name: ROCTHRUST_GFX942_TEST_PIPELINE_ID
value: 194
- name: ROCTHRUST_PIPELINE_ID
value: 276
value: 94
- name: ROCTHRUST_TAGGED_PIPELINE_ID
value: 26
- name: ROCTRACER_GFX942_TEST_PIPELINE_ID
value: 181
- name: ROCTRACER_PIPELINE_ID
value: 141
- name: ROCTRACER_TAGGED_PIPELINE_ID
value: 30
- name: ROCWMMA_GFX942_TEST_PIPELINE_ID
value: 193
- name: ROCWMMA_PIPELINE_ID
value: 109
- name: ROCWMMA_TAGGED_PIPELINE_ID
value: 57
- name: RPP_GFX942_TEST_PIPELINE_ID
value: 182
- name: RPP_PIPELINE_ID
value: 78
- name: RPP_TAGGED_PIPELINE_ID
value: 39
- name: TRANSFERBENCH_PIPELINE_ID
value: 265
- name: TRANSFERBENCH_TAGGED_PIPELINE_ID
value: 266
- name: BOOST_DEPENDENCY_PIPELINE_ID
value: 250

@@ -1,18 +1,3 @@
Datacenter
GST
IET
LTO
MX
Microscaling
NANOO
ROCprof
affinitization
amdclang
benefitting
demangled
inlined
microscaling
roofline
AAC
ABI
ACE
@@ -47,6 +32,7 @@ Andrej
Arb
Autocast
BARs
BatchNorm
BLAS
BMC
BabelStream
@@ -140,6 +126,7 @@ FX
Filesystem
FindDb
Flang
FlashAttention
FluxBenchmark
Fortran
Fuyu
@@ -243,6 +230,7 @@ LM
LSAN
LSan
LTS
LSTMs
LanguageCrossEntropy
LoRA
MEM
@@ -398,6 +386,7 @@ Ryzen
SALU
SBIOS
SCA
ScaledGEMM
SDK
SDMA
SDPA
@@ -438,6 +427,8 @@ TCI
TCIU
TCP
TCR
TensorRT
TensorFloat
TF
TFLOPS
TP
@@ -524,6 +515,7 @@ allocator
allocators
amdgpu
api
aten
atmi
atomics
autogenerated
@@ -694,6 +686,7 @@ installable
interop
interprocedural
intra
intrinsics
invariants
invocating
ipo
@@ -840,6 +833,7 @@ roctracer
rst
runtime
runtimes
ResNet
sL
scalability
scalable
@@ -855,6 +849,7 @@ sm
smi
softmax
spack
spmm
src
stochastically
strided
@@ -863,6 +858,7 @@ subdirectory
subexpression
subfolder
subfolders
submatrix
submodule
submodules
subnet
@@ -887,6 +883,7 @@ torchvision
tqdm
tracebacks
txt
TopK
uarch
uncached
uncacheable

144 README.md
@@ -19,17 +19,143 @@ ROCm supports programming models, such as OpenMP and OpenCL, and includes all ne
source software compilers, debuggers, and libraries. ROCm is fully integrated into machine learning
(ML) frameworks, such as PyTorch and TensorFlow.

> [!IMPORTANT]
> A new open source build platform for ROCm is under development at
> https://github.com/ROCm/TheRock, featuring a unified CMake build with bundled
> dependencies, Windows support, and more.
>
> The instructions below describe the prior process for building from source
> which will be replaced once TheRock is mature enough.
## Getting the ROCm Source Code

## Getting and Building ROCm from Source
AMD ROCm is built from open source software. It is, therefore, possible to modify the various components of ROCm by downloading the source code and rebuilding the components. The source code for ROCm components can be cloned from each of the GitHub repositories using git. For easy access to download the correct versions of each of these tools, the ROCm repository contains a repo manifest file called [default.xml](./default.xml). You can use this manifest file to download the source code for ROCm software.

Please use the [TheRock](https://github.com/ROCm/TheRock) build system to build ROCm from source.
### Installing the repo tool

The repo tool from Google allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo tool:

```bash
mkdir -p ~/bin/
curl https://storage.googleapis.com/git-repo-downloads/repo > ~/bin/repo
chmod a+x ~/bin/repo
```

**Note:** The ```~/bin/``` folder is used as an example. You can specify a different folder to install the repo tool into if you desire.

### Installing git-lfs

Some ROCm projects use the Git Large File Storage (LFS) format, which may require you to install git-lfs. Refer to [Git Large File Storage](https://github.com/git-lfs/git-lfs/blob/main/INSTALLING.md) for more information. For example, to install git-lfs for Ubuntu, use the following command:

```bash
sudo apt-get install git-lfs
```

### Downloading the ROCm source code

The following example shows how to use the repo tool to download the ROCm source code. If you choose a directory other than ~/bin/ to install the repo tool, you must use that chosen directory in the code as shown below:

```bash
mkdir -p ~/ROCm/
cd ~/ROCm/
export ROCM_VERSION=6.4.1
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.4.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
~/bin/repo sync
```

**Note:** Using this sample code will cause the repo tool to download the open source code associated with the specified ROCm release. Ensure that you have ssh-keys configured on your machine for your GitHub ID prior to the download as explained at [Connecting to GitHub with SSH](https://docs.github.com/en/authentication/connecting-to-github-with-ssh).

## Building the ROCm source code

Each ROCm component repository contains directions for building that component, such as the rocSPARSE documentation [Installation and Building for Linux](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/install/Linux_Install_Guide.html). Refer to the specific component documentation for instructions on building the repository.

Each release of the ROCm software supports specific hardware and software configurations. Refer to [System requirements (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) for the current supported hardware and OS.

## Build ROCm from source

The build uses as many processors as it can find so that it can build in parallel. Some of the compiles can consume as much as 10 GB of RAM, so make sure you have plenty of swap space!

By default, the ROCm build compiles for all supported GPU architectures and takes approximately 500 CPU hours.
Build time drops significantly if you limit the GPU architectures to build against using the GPU_ARCHS environment variable, as shown below.

```bash
# --------------------------------------
# Step 1: Clone the source code
# --------------------------------------

mkdir -p ~/WORKSPACE/ # Or any folder name other than WORKSPACE
cd ~/WORKSPACE/
export ROCM_VERSION=6.4.1
~/bin/repo init -u http://github.com/ROCm/ROCm.git -b roc-6.4.x -m tools/rocm-build/rocm-${ROCM_VERSION}.xml
~/bin/repo sync

# --------------------------------------
# Step 2: Prepare build environment
# --------------------------------------

# Option 1: Start a docker container
# Pulling required base docker images:
# Ubuntu22.04 built from ROCm/tools/rocm-build/docker/ubuntu22/Dockerfile
docker pull rocm/rocm-build-ubuntu-22.04:6.4
# Ubuntu24.04 built from ROCm/tools/rocm-build/docker/ubuntu24/Dockerfile
docker pull rocm/rocm-build-ubuntu-24.04:6.4

# Start docker container and mount the source code folder:
docker run -ti \
-e ROCM_VERSION=${ROCM_VERSION} \
-e CCACHE_DIR=$HOME/.ccache \
-e CCACHE_ENABLED=true \
-e DOCK_WORK_FOLD=/src \
-w /src \
-v $PWD:/src \
-v /etc/passwd:/etc/passwd \
-v /etc/shadow:/etc/shadow \
-v ${HOME}/.ccache:${HOME}/.ccache \
-u $(id -u):$(id -g) \
<replace_with_required_ubuntu_base_docker_image> bash

# Option 2: Install required packages into the host machine
# For ubuntu22.04 system
cd ROCm/tools/rocm-build/docker/ubuntu22
cp * /tmp && cd /tmp
bash install-prerequisites.sh
# For ubuntu24.04 system
cd ROCm/tools/rocm-build/docker/ubuntu24
cp * /tmp && cd /tmp
bash install-prerequisites.sh

# --------------------------------------
# Step 3: Run build command line
# --------------------------------------

# Select GPU targets before building:
# When GPU_ARCHS is not set, default GPU targets supported by ROCm6.1 will be used.
# To build against a subset of GFX architectures you can use the below env variable.
# Support MI300 (gfx940, gfx941, gfx942).
export GPU_ARCHS="gfx942" # Example
export GPU_ARCHS="gfx940;gfx941;gfx942" # Example

cd ~/WORKSPACE/
# Pick and run build commands in the docker container:
# Build rocm-dev packages
make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} rocm-dev
# Build all ROCm packages
make -f ROCm/tools/rocm-build/ROCm.mk -j ${NPROC:-$(nproc)} all
# List all ROCm components to find required components
make -f ROCm/tools/rocm-build/ROCm.mk list_components
# Build a single ROCm package
make -f ROCm/tools/rocm-build/ROCm.mk T_rocblas

# Find built packages in ubuntu22.04:
out/ubuntu-22.04/22.04/deb/
# Find built packages in ubuntu24.04:
out/ubuntu-24.04/24.04/deb/

# Find build logs in ubuntu22.04:
out/ubuntu-22.04/22.04/logs/
# Find build logs in ubuntu24.04:
out/ubuntu-24.04/24.04/logs/
# All logs pertaining to failed components end with the .errors extension.
out/ubuntu-22.04/22.04/logs/rocblas.errors # Example
# All logs pertaining to components still building end with the .inprogress extension.
out/ubuntu-22.04/22.04/logs/rocblas.inprogress # Example
# All logs pertaining to passed components use the component name.
out/ubuntu-22.04/22.04/logs/rocblas # Example
```

Note: [Overview for ROCm.mk](tools/rocm-build/README.md)

## ROCm documentation


@@ -654,4 +654,4 @@ There are a number of upcoming changes planned for HIP runtime API in an upcomin
that are not backward compatible with prior releases. Most of these changes increase
alignment between HIP and CUDA APIs or behavior. Some of the upcoming changes are to
clean up header files, remove namespace collision, and have a clear separation between
`hipRTC` and HIP runtime.
`hipRTC` and HIP runtime. For more information, see [HIP 7.0 Is Coming: What You Need to Know to Stay Ahead](https://rocm.blogs.amd.com/ecosystems-and-partners/transition-to-hip-7.0:-guidance-on-upcoming-compatibility-changes/README.html).

@@ -53,7 +53,7 @@ Use cases and recommendations
* The `nanoGPT in JAX <https://rocm.blogs.amd.com/artificial-intelligence/nanoGPT-JAX/README.html>`_
blog explores the implementation and training of a Generative Pre-trained
Transformer (GPT) model in JAX, inspired by Andrej Karpathy’s PyTorch-based
nanoGPT. Comparing how essential GPT components—such as self-attention
nanoGPT. Comparing how essential GPT components—such as self-attention
mechanisms and optimizers—are realized in JAX also highlights
JAX’s unique features.

@@ -160,12 +160,14 @@ associated inventories are tested for `ROCm 6.3.2 <https://repo.radeon.com/rocm/
- Ubuntu 22.04
- `3.10.16 <https://www.python.org/downloads/release/python-31016/>`_

.. _key_rocm_libraries:

Key ROCm libraries for JAX
================================================================================

JAX functionality on ROCm is determined by its underlying library
dependencies. These ROCm components affect the capabilities, performance, and
feature set available to developers.
The following ROCm libraries represent potential targets that could be utilized
by JAX on ROCm for various computational tasks. The actual libraries used will
depend on the specific implementation and operations performed.

.. list-table::
:header-rows: 1
@@ -173,347 +175,140 @@ feature set available to developers.
* - ROCm library
- Version
- Purpose
- Used in
* - `hipBLAS <https://github.com/ROCm/hipBLAS>`_
- :version-ref:`hipBLAS rocm_version`
- Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
matrix and vector operations.
- Matrix multiplication in ``jax.numpy.matmul``, ``jax.lax.dot`` and
``jax.lax.dot_general``; operations like ``jax.numpy.dot``, which
involve vector and matrix computations; and batch matrix
multiplications such as ``jax.numpy.einsum`` with
matrix-multiplication patterns.
* - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`_
- :version-ref:`hipBLASLt rocm_version`
- hipBLASLt is an extension of hipBLAS, providing additional
features like epilogues fused into the matrix multiplication kernel or
use of integer tensor cores.
- Matrix multiplication in ``jax.numpy.matmul`` or ``jax.lax.dot``;
XLA (Accelerated Linear Algebra) uses hipBLASLt for optimized matrix
operations, mixed-precision support, and hardware-specific
optimizations.
* - `hipCUB <https://github.com/ROCm/hipCUB>`_
- :version-ref:`hipCUB rocm_version`
- Provides a C++ template library for parallel algorithms for reduction,
scan, sort and select.
- Reduction functions (``jax.numpy.sum``, ``jax.numpy.mean``,
``jax.numpy.prod``, ``jax.numpy.max`` and ``jax.numpy.min``), prefix sum
(``jax.numpy.cumsum``, ``jax.numpy.cumprod``) and sorting
(``jax.numpy.sort``, ``jax.numpy.argsort``).
* - `hipFFT <https://github.com/ROCm/hipFFT>`_
- :version-ref:`hipFFT rocm_version`
- Provides GPU-accelerated Fast Fourier Transform (FFT) operations.
- Used in functions like ``jax.numpy.fft``.
* - `hipRAND <https://github.com/ROCm/hipRAND>`_
- :version-ref:`hipRAND rocm_version`
- Provides fast random number generation for GPUs.
- The ``jax.random.uniform``, ``jax.random.normal``,
``jax.random.randint`` and ``jax.random.split``.
* - `hipSOLVER <https://github.com/ROCm/hipSOLVER>`_
- :version-ref:`hipSOLVER rocm_version`
- Provides GPU-accelerated solvers for linear systems, eigenvalues, and
singular value decompositions (SVD).
- Solving linear systems (``jax.numpy.linalg.solve``), matrix
factorizations, SVD (``jax.numpy.linalg.svd``) and eigenvalue problems
(``jax.numpy.linalg.eig``).
* - `hipSPARSE <https://github.com/ROCm/hipSPARSE>`_
- :version-ref:`hipSPARSE rocm_version`
- Accelerates operations on sparse matrices, such as sparse matrix-vector
or matrix-matrix products.
- Sparse matrix multiplication (``jax.numpy.matmul``), sparse
matrix-vector and matrix-matrix products
(``jax.experimental.sparse.dot``), sparse linear system solvers and
sparse data handling.
* - `hipSPARSELt <https://github.com/ROCm/hipSPARSELt>`_
- :version-ref:`hipSPARSELt rocm_version`
- Accelerates operations on sparse matrices, such as sparse matrix-vector
or matrix-matrix products.
- Sparse matrix multiplication (``jax.numpy.matmul``), sparse
matrix-vector and matrix-matrix products
(``jax.experimental.sparse.dot``) and sparse linear system solvers.
* - `MIOpen <https://github.com/ROCm/MIOpen>`_
- :version-ref:`MIOpen rocm_version`
- Optimized for deep learning primitives such as convolutions, pooling,
normalization, and activation functions.
- Speeds up convolutional neural networks (CNNs), recurrent neural
networks (RNNs), and other layers. Used in operations like
``jax.nn.conv``, ``jax.nn.relu``, and ``jax.nn.batch_norm``.
* - `RCCL <https://github.com/ROCm/rccl>`_
- :version-ref:`RCCL rocm_version`
- Optimized for multi-GPU communication for operations like all-reduce,
broadcast, and scatter.
- Distributes computations across multiple GPUs with ``pmap`` and
``jax.distributed``. XLA automatically uses RCCL when executing
operations across multiple GPUs on AMD hardware.
* - `rocThrust <https://github.com/ROCm/rocThrust>`_
|
||||
- :version-ref:`rocThrust rocm_version`
|
||||
- Provides a C++ template library for parallel algorithms like sorting,
|
||||
reduction, and scanning.
|
||||
- Reduction operations like ``jax.numpy.sum``, ``jax.pmap`` for
|
||||
distributed training, which involves parallel reductions or
|
||||
operations like ``jax.numpy.cumsum`` can use rocThrust.

.. note::

   This table shows ROCm libraries that could potentially be utilized by JAX.
   Not all libraries may be used in every configuration, and the actual
   library usage will depend on the specific operations and implementation
   details.
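
As a quick illustration of how these libraries surface through JAX, the
following sketch exercises several of the operations named above. It is
illustrative only; which ROCm library XLA actually dispatches to depends on
the JAX version, the configuration, and the hardware.

.. code-block:: python

   import jax
   import jax.numpy as jnp

   key = jax.random.PRNGKey(0)          # pseudorandom generation (hipRAND)
   a = jax.random.normal(key, (1024, 1024))
   b = jax.random.normal(key, (1024, 1024))

   c = jnp.matmul(a, b)                 # matrix multiplication (hipBLASLt)
   s = jnp.sum(c)                       # reduction (hipCUB/rocThrust)
   f = jnp.fft.fft(c)                   # Fast Fourier Transform (hipFFT)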

Supported data types and modules
===============================================================================

The following tables list the supported public JAX API data types and modules.

Supported data types
--------------------------------------------------------------------------------

ROCm supports all the JAX data types of the `jax.dtypes <https://docs.jax.dev/en/latest/jax.dtypes.html>`_
module, `jax.numpy.dtype <https://docs.jax.dev/en/latest/_autosummary/jax.numpy.dtype.html>`_
and `default_dtype <https://docs.jax.dev/en/latest/default_dtypes.html>`_.
The ROCm-supported data types in JAX are collected in the following table.

.. list-table::
   :header-rows: 1

   * - Data type
     - Description
   * - ``bfloat16``
     - 16-bit bfloat (brain floating point).
   * - ``bool``
     - Boolean.
   * - ``complex128``
     - 128-bit complex.
   * - ``complex64``
     - 64-bit complex.
   * - ``float16``
     - 16-bit (half precision) floating-point.
   * - ``float32``
     - 32-bit (single precision) floating-point.
   * - ``float64``
     - 64-bit (double precision) floating-point.
   * - ``half``
     - 16-bit (half precision) floating-point.
   * - ``int16``
     - Signed 16-bit integer.
   * - ``int32``
     - Signed 32-bit integer.
   * - ``int64``
     - Signed 64-bit integer.
   * - ``int8``
     - Signed 8-bit integer.
   * - ``uint16``
     - Unsigned 16-bit (word) integer.
   * - ``uint32``
     - Unsigned 32-bit (dword) integer.
   * - ``uint64``
     - Unsigned 64-bit (qword) integer.
   * - ``uint8``
     - Unsigned 8-bit (byte) integer.

The following table maps the public JAX API modules to their supported
JAX and ROCm versions.

.. list-table::
   :header-rows: 1

   * - Module
     - Description
     - As of JAX
     - As of ROCm
   * - ``jax.numpy``
     - Implements the NumPy API, using the primitives in ``jax.lax``.
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy``
     - Provides GPU-accelerated and differentiable implementations of many
       functions from the SciPy library, leveraging JAX's transformations
       (e.g., ``grad``, ``jit``, ``vmap``).
     - 0.1.56
     - 5.0.0
   * - ``jax.lax``
     - A library of primitive operations that underpins libraries such as
       ``jax.numpy``. Transformation rules, such as Jacobian-vector product
       (JVP) and batching rules, are typically defined as transformations on
       ``jax.lax`` primitives.
     - 0.1.57
     - 5.0.0
   * - ``jax.random``
     - Provides a number of routines for deterministic generation of sequences
       of pseudorandom numbers.
     - 0.1.58
     - 5.0.0
   * - ``jax.sharding``
     - Allows defining how arrays are partitioned and distributed across
       multiple devices.
     - 0.3.20
     - 5.1.0
   * - ``jax.distributed``
     - Enables the scaling of computations across multiple devices on a single
       machine or across multiple machines.
     - 0.1.74
     - 5.0.0
   * - ``jax.image``
     - Contains image manipulation functions like resize, scale and translation.
     - 0.1.57
     - 5.0.0
   * - ``jax.nn``
     - Contains common functions for neural network libraries.
     - 0.1.56
     - 5.0.0
   * - ``jax.ops``
     - Computes the minimum, maximum, sum or product within segments of an
       array.
     - 0.1.57
     - 5.0.0
   * - ``jax.stages``
     - Contains interfaces to stages of the compiled execution process.
     - 0.3.4
     - 5.0.0
   * - ``jax.extend``
     - Provides modules for access to JAX's internal machinery. The
       ``jax.extend`` module defines a library view of some of JAX's internal
       components.
     - 0.4.15
     - 5.5.0
   * - ``jax.example_libraries``
     - Serves as a collection of example code and libraries that demonstrate
       various capabilities of JAX.
     - 0.1.74
     - 5.0.0
   * - ``jax.experimental``
     - Namespace for experimental features and APIs that are in development or
       are not yet fully stable for production use.
     - 0.1.56
     - 5.0.0
   * - ``jax.lib``
     - Set of internal tools and types for bridging between JAX's Python
       frontend and its XLA backend.
     - 0.4.6
     - 5.3.0
   * - ``jax_triton``
     - Library that integrates the Triton deep learning compiler with JAX.
     - jax_triton 0.2.0
     - 6.2.4
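
A quick way to confirm the data types and defaults above on a ROCm device is
to create arrays explicitly and inspect them. This is a minimal sketch,
assuming a working ``jax[rocm]`` installation.

.. code-block:: python

   import jax
   import jax.numpy as jnp

   print(jax.devices())                 # should list the ROCm GPU(s)

   x = jnp.ones((2, 2), dtype=jnp.bfloat16)
   y = jnp.arange(4)                    # int32 by default (64-bit mode off)
   print(x.dtype, y.dtype)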

jax.scipy module
-------------------------------------------------------------------------------

A SciPy-like API for scientific computing.

.. list-table::
   :header-rows: 1

   * - Module
     - As of JAX
     - As of ROCm
   * - ``jax.scipy.cluster``
     - 0.3.11
     - 5.1.0
   * - ``jax.scipy.fft``
     - 0.1.71
     - 5.0.0
   * - ``jax.scipy.integrate``
     - 0.4.15
     - 5.5.0
   * - ``jax.scipy.interpolate``
     - 0.1.76
     - 5.0.0
   * - ``jax.scipy.linalg``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.ndimage``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.optimize``
     - 0.1.57
     - 5.0.0
   * - ``jax.scipy.signal``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.spatial.transform``
     - 0.4.12
     - 5.4.0
   * - ``jax.scipy.sparse.linalg``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.special``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats``
     - 0.1.56
     - 5.0.0
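
As a small example of the ``jax.scipy`` surface, the linear-algebra routines
mirror their SciPy counterparts and run on the GPU. A minimal sketch:

.. code-block:: python

   import jax.numpy as jnp
   from jax.scipy import linalg

   A = jnp.array([[3.0, 1.0], [1.0, 2.0]])
   b = jnp.array([9.0, 8.0])

   x = linalg.solve(A, b)               # solved on the ROCm device
   print(x)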

jax.scipy.stats module
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. list-table::
   :header-rows: 1

   * - Module
     - As of JAX
     - As of ROCm
   * - ``jax.scipy.stats.bernoulli``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.beta``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.betabinom``
     - 0.1.61
     - 5.0.0
   * - ``jax.scipy.stats.binom``
     - 0.4.14
     - 5.4.0
   * - ``jax.scipy.stats.cauchy``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.chi2``
     - 0.1.61
     - 5.0.0
   * - ``jax.scipy.stats.dirichlet``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.expon``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.gamma``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.gennorm``
     - 0.3.15
     - 5.2.0
   * - ``jax.scipy.stats.geom``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.laplace``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.logistic``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.multinomial``
     - 0.3.18
     - 5.1.0
   * - ``jax.scipy.stats.multivariate_normal``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.nbinom``
     - 0.1.72
     - 5.0.0
   * - ``jax.scipy.stats.norm``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.pareto``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.poisson``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.t``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.truncnorm``
     - 0.4.0
     - 5.3.0
   * - ``jax.scipy.stats.uniform``
     - 0.1.56
     - 5.0.0
   * - ``jax.scipy.stats.vonmises``
     - 0.4.2
     - 5.3.0
   * - ``jax.scipy.stats.wrapcauchy``
     - 0.4.20
     - 5.6.0
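
The distributions above follow the ``scipy.stats`` conventions, exposing
functions such as ``pdf`` and ``logpdf``. A minimal sketch using
``jax.scipy.stats.norm``:

.. code-block:: python

   import jax.numpy as jnp
   from jax.scipy.stats import norm

   x = jnp.linspace(-2.0, 2.0, 5)
   print(norm.pdf(x))                        # standard normal density
   print(norm.logpdf(x, loc=1.0, scale=2.0))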

jax.extend module
-------------------------------------------------------------------------------

Modules for JAX extensions.

.. list-table::
   :header-rows: 1

   * - Module
     - As of JAX
     - As of ROCm
   * - ``jax.extend.ffi``
     - 0.4.30
     - 6.0.0
   * - ``jax.extend.linear_util``
     - 0.4.17
     - 5.6.0
   * - ``jax.extend.mlir``
     - 0.4.26
     - 5.6.0
   * - ``jax.extend.random``
     - 0.4.15
     - 5.5.0

Unsupported JAX features
===============================================================================

The following GPU-accelerated JAX features are not supported by ROCm for
the listed supported JAX versions.

.. list-table::
   :header-rows: 1

   * - Feature
     - Description
   * - Mixed Precision with TF32
     - Mixed precision with TF32 is used for matrix multiplications,
       convolutions, and other linear algebra operations, particularly in
       deep learning workloads like CNNs and transformers.
   * - XLA int4 support
     - 4-bit integer (int4) precision in the XLA compiler.
   * - MOSAIC (GPU)
     - Mosaic is a library of kernel-building abstractions for JAX's Pallas
       system.

.. note::

   JAX data type support is affected by the :ref:`key_rocm_libraries` and is
   collected on the :doc:`ROCm data types and precision support <rocm:reference/precision-support>`
   page.

Supported modules
--------------------------------------------------------------------------------

For a complete and up-to-date list of JAX public modules (for example, ``jax.numpy``,
``jax.scipy``, ``jax.lax``), their descriptions, and usage, refer directly to the
`official JAX API documentation <https://jax.readthedocs.io/en/latest/jax.html>`_.

.. note::

   Since version 0.1.56, JAX has full support for ROCm, and the
   :ref:`Known issues and important notes <jax_comp_known_issues>` section
   contains details about limitations specific to the ROCm backend. The list of
   JAX API modules is maintained by the JAX project and is subject to change.
   Refer to the official JAX documentation for the most up-to-date information.

@@ -372,24 +372,15 @@ feature set available to developers.

involve matrix products, such as ``torch.matmul``, ``torch.bmm``, and
more.

Supported modules and data types
================================================================================

The following section outlines the supported data types, modules, and domain
libraries available in PyTorch on ROCm.

Supported data types
--------------------------------------------------------------------------------

`torch <https://pytorch.org/docs/stable/index.html>`_ is the central module of
PyTorch, providing data structures for multi-dimensional tensors and
implementing mathematical operations on them. It also includes utilities for
efficient serialization of tensors and arbitrary data types, and other tools.

Tensor data types
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The tensor data type is specified using the ``dtype`` attribute or argument.
PyTorch supports many data types for different use cases.

The following table lists `torch.Tensor <https://pytorch.org/docs/stable/tensors.html>`_
@@ -400,539 +391,154 @@ single data types:

   * - Data type
     - Description
     - As of PyTorch
     - As of ROCm
   * - ``torch.float8_e4m3fn``
     - 8-bit floating point, e4m3
     - 2.3
     - 5.5
   * - ``torch.float8_e5m2``
     - 8-bit floating point, e5m2
     - 2.3
     - 5.5
   * - ``torch.float16`` or ``torch.half``
     - 16-bit floating point
     - 0.1.6
     - 2.0
   * - ``torch.bfloat16``
     - 16-bit floating point
     - 1.6
     - 2.6
   * - ``torch.float32`` or ``torch.float``
     - 32-bit floating point
     - 0.1.12_2
     - 2.0
   * - ``torch.float64`` or ``torch.double``
     - 64-bit floating point
     - 0.1.12_2
     - 2.0
   * - ``torch.complex32`` or ``torch.chalf``
     - 32-bit complex numbers
     - 1.6
     - 2.0
   * - ``torch.complex64`` or ``torch.cfloat``
     - 64-bit complex numbers
     - 1.6
     - 2.0
   * - ``torch.complex128`` or ``torch.cdouble``
     - 128-bit complex numbers
     - 1.6
     - 2.0
   * - ``torch.uint8``
     - 8-bit integer (unsigned)
     - 0.1.12_2
     - 2.0
   * - ``torch.uint16``
     - 16-bit integer (unsigned)
     - 2.3
     - Not natively supported
   * - ``torch.uint32``
     - 32-bit integer (unsigned)
     - 2.3
     - Not natively supported
   * - ``torch.uint64``
     - 64-bit integer (unsigned)
     - 2.3
     - Not natively supported
   * - ``torch.int8``
     - 8-bit integer (signed)
     - 1.12
     - 5.0
   * - ``torch.int16`` or ``torch.short``
     - 16-bit integer (signed)
     - 0.1.12_2
     - 2.0
   * - ``torch.int32`` or ``torch.int``
     - 32-bit integer (signed)
     - 0.1.12_2
     - 2.0
   * - ``torch.int64`` or ``torch.long``
     - 64-bit integer (signed)
     - 0.1.12_2
     - 2.0
   * - ``torch.bool``
     - Boolean
     - 1.2
     - 2.0
   * - ``torch.quint8``
     - Quantized 8-bit integer (unsigned)
     - 1.8
     - 5.0
   * - ``torch.qint8``
     - Quantized 8-bit integer (signed)
     - 1.8
     - 5.0
   * - ``torch.qint32``
     - Quantized 32-bit integer (signed)
     - 1.8
     - 5.0
   * - ``torch.quint4x2``
     - Quantized 4-bit integer (unsigned)
     - 1.8
     - 5.0

.. note::

   Unsigned types, except ``uint8``, have limited support in eager mode. They
   primarily exist to assist usage with ``torch.compile``.

   See :doc:`ROCm precision support <rocm:reference/precision-support>` for the
   native hardware support of data types.
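
The ``dtype`` argument shown in the table is used the same way on ROCm as on
CUDA builds, since ROCm PyTorch exposes the GPU through the ``cuda`` device
type. A minimal sketch:

.. code-block:: python

   import torch

   device = "cuda" if torch.cuda.is_available() else "cpu"

   x = torch.ones(4, dtype=torch.bfloat16, device=device)
   y = torch.tensor([1, 2, 3], dtype=torch.int16, device=device)
   print(x.dtype, y.dtype, x.device)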

torch.cuda
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

``torch.cuda`` in PyTorch is a module that provides utilities and functions for
managing and utilizing AMD and NVIDIA GPUs. It enables GPU-accelerated
computations, memory management, and efficient execution of tensor operations,
leveraging ROCm and CUDA as the underlying frameworks.

.. list-table::
   :header-rows: 1

   * - Feature
     - Description
     - As of PyTorch
     - As of ROCm
   * - Device management
     - Utilities for managing and interacting with GPUs.
     - 0.4.0
     - 3.8
   * - Tensor operations on GPU
     - Performs tensor operations such as addition and matrix multiplications
       on the GPU.
     - 0.4.0
     - 3.8
   * - Streams and events
     - Streams allow overlapping computation and communication for optimized
       performance. Events enable synchronization.
     - 1.6.0
     - 3.8
   * - Memory management
     - Functions to manage and inspect memory usage like
       ``torch.cuda.memory_allocated()``, ``torch.cuda.max_memory_allocated()``,
       ``torch.cuda.memory_reserved()`` and ``torch.cuda.empty_cache()``.
     - 0.3.0
     - 1.9.2
   * - Running process lists of memory management
     - Returns a human-readable printout of the running processes and their GPU
       memory use for a given device with functions like
       ``torch.cuda.memory_stats()`` and ``torch.cuda.memory_summary()``.
     - 1.8.0
     - 4.0
   * - Communication collectives
     - Set of APIs that enable efficient communication between multiple GPUs,
       allowing for distributed computing and data parallelism.
     - 1.9.0
     - 5.0
   * - ``torch.cuda.CUDAGraph``
     - Graphs capture sequences of GPU operations to minimize kernel launch
       overhead and improve performance.
     - 1.10.0
     - 5.3
   * - TunableOp
     - A mechanism that allows certain operations to be more flexible and
       optimized for performance. It enables automatic tuning of kernel
       configurations and other settings to achieve the best possible
       performance based on the specific hardware (GPU) and workload.
     - 2.0
     - 5.4
   * - NVIDIA Tools Extension (NVTX)
     - Integration with NVTX for profiling and debugging GPU performance using
       NVIDIA's Nsight tools.
     - 1.8.0
     - ❌
   * - Lazy loading NVRTC
     - Delays JIT compilation with NVRTC until the code is explicitly needed.
     - 1.13.0
     - ❌
   * - Jiterator (beta)
     - Jiterator allows asynchronous data streaming into computation streams
       during training loops.
     - 1.13.0
     - 5.2

.. Need to validate and extend.
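
A short sketch of the device and memory-management utilities listed above;
the same calls work on ROCm, where the HIP device is exposed as ``cuda``:

.. code-block:: python

   import torch

   if torch.cuda.is_available():
       print(torch.cuda.get_device_name(0))

       x = torch.randn(1024, 1024, device="cuda")
       y = x @ x                                  # runs on the GPU
       torch.cuda.synchronize()                   # wait for the kernel

       print(torch.cuda.memory_allocated() // 2**20, "MiB allocated")
       torch.cuda.empty_cache()                   # release cached blocks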

torch.backends.cuda
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

``torch.backends.cuda`` is a PyTorch module that provides configuration options
and flags to control the behavior of ROCm or CUDA operations. It is part of the
PyTorch backend configuration system, which allows users to fine-tune how
PyTorch interacts with the ROCm or CUDA environment.

.. list-table::
   :header-rows: 1

   * - Feature
     - Description
     - As of PyTorch
     - As of ROCm
   * - ``cufft_plan_cache``
     - Manages caching of GPU FFT plans to optimize repeated FFT computations.
     - 1.7.0
     - 5.0
   * - ``matmul.allow_tf32``
     - Enables or disables the use of TensorFloat-32 (TF32) precision for
       faster matrix multiplications on GPUs with Tensor Cores.
     - 1.10.0
     - ❌
   * - ``matmul.allow_fp16_reduced_precision_reduction``
     - Reduced precision reductions (e.g., with fp16 accumulation type) are
       allowed with fp16 GEMMs.
     - 2.0
     - ❌
   * - ``matmul.allow_bf16_reduced_precision_reduction``
     - Reduced precision reductions are allowed with bf16 GEMMs.
     - 2.0
     - ❌
   * - ``enable_cudnn_sdp``
     - Globally enables cuDNN SDPA's kernels within SDPA.
     - 2.0
     - ❌
   * - ``enable_flash_sdp``
     - Globally enables or disables FlashAttention for SDPA.
     - 2.1
     - ❌
   * - ``enable_mem_efficient_sdp``
     - Globally enables or disables Memory-Efficient Attention for SDPA.
     - 2.1
     - ❌
   * - ``enable_math_sdp``
     - Globally enables or disables the PyTorch C++ implementation within SDPA.
     - 2.1
     - ❌

.. Need to validate and extend.
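
These flags are read and set as plain attributes. A minimal sketch (on ROCm
the TF32 switches are accepted but have no effect, as noted in the table):

.. code-block:: python

   import torch

   # Inspect and bound the FFT plan cache for the current device.
   print(torch.backends.cuda.cufft_plan_cache.max_size)
   torch.backends.cuda.cufft_plan_cache.max_size = 16

   # TF32 toggles exist on all builds but only change behavior on
   # hardware with TF32 tensor cores.
   torch.backends.cuda.matmul.allow_tf32 = False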

torch.backends.cudnn
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Supported ``torch`` options include:

.. list-table::
   :header-rows: 1

   * - Option
     - Description
     - As of PyTorch
     - As of ROCm
   * - ``allow_tf32``
     - TensorFloat-32 tensor cores may be used in cuDNN convolutions on NVIDIA
       Ampere or newer GPUs.
     - 1.12.0
     - ❌
   * - ``deterministic``
     - A bool that, if True, causes cuDNN to only use deterministic
       convolution algorithms.
     - 1.12.0
     - 6.0

Automatic mixed precision: torch.amp
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

PyTorch automates the process of using both 16-bit (half-precision, float16) and
32-bit (single-precision, float32) floating-point types in model training and
inference.

.. list-table::
   :header-rows: 1

   * - Feature
     - Description
     - As of PyTorch
     - As of ROCm
   * - Autocasting
     - Autocast instances serve as context managers or decorators that allow
       regions of your script to run in mixed precision.
     - 1.9
     - 2.5
   * - Gradient scaling
     - To prevent underflow, "gradient scaling" multiplies the network's
       loss by a scale factor and invokes a backward pass on the scaled
       loss. The same factor then scales gradients flowing backward through
       the network. In other words, gradient values have a larger magnitude so
       that they don't flush to zero.
     - 1.9
     - 2.5
   * - CUDA op-specific behavior
     - These ops always go through autocasting whether they are invoked as part
       of a ``torch.nn.Module``, as a function, or as a ``torch.Tensor`` method. If
       functions are exposed in multiple namespaces, they go through
       autocasting regardless of the namespace.
     - 1.9
     - 2.5
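
A minimal mixed-precision training step with autocasting and gradient
scaling, as described above (an illustrative sketch; assumes a ROCm GPU):

.. code-block:: python

   import torch

   model = torch.nn.Linear(64, 8).cuda()
   opt = torch.optim.SGD(model.parameters(), lr=1e-2)
   scaler = torch.cuda.amp.GradScaler()

   x = torch.randn(32, 64, device="cuda")
   y = torch.randn(32, 8, device="cuda")

   with torch.autocast(device_type="cuda", dtype=torch.float16):
       loss = torch.nn.functional.mse_loss(model(x), y)

   scaler.scale(loss).backward()    # backward pass on the scaled loss
   scaler.step(opt)                 # unscales gradients before the step
   scaler.update()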

Distributed library features
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

PyTorch distributed library includes a collective of parallelism modules, a
communications layer, and infrastructure for launching and debugging large
training jobs. See :ref:`rocm-for-ai-pytorch-distributed` for more information.

The distributed library feature in PyTorch provides tools and APIs for building
and running distributed machine learning workflows. It allows training models
across multiple processes, GPUs, or nodes in a cluster, enabling efficient use
of computational resources and scalability for large-scale tasks.

.. list-table::
   :header-rows: 1

   * - Feature
     - Description
     - As of PyTorch
     - As of ROCm
   * - TensorPipe
     - A point-to-point communication library integrated into
       PyTorch for distributed training. It handles tensor data transfers
       efficiently between different processes or devices, including those on
       separate machines.
     - 1.8
     - 5.4
   * - Gloo
     - Designed for multi-machine and multi-GPU setups, enabling
       efficient communication and synchronization between processes. Gloo is
       one of the default backends for PyTorch's Distributed Data Parallel
       (DDP) and RPC frameworks, alongside other backends like NCCL and MPI.
     - 1.0
     - 2.0
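
A minimal sketch of initializing the distributed backend; the ``nccl``
backend name maps to RCCL on ROCm. Launch with, for example,
``torchrun --nproc_per_node=2 script.py``:

.. code-block:: python

   import torch
   import torch.distributed as dist

   dist.init_process_group(backend="nccl")        # RCCL on ROCm
   rank = dist.get_rank()
   torch.cuda.set_device(rank % torch.cuda.device_count())

   t = torch.ones(1, device="cuda") * rank
   dist.all_reduce(t)                             # sum across ranks
   print(f"rank {rank}: {t.item()}")

   dist.destroy_process_group()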

torch.compiler
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. list-table::
   :header-rows: 1

   * - Feature
     - Description
     - As of PyTorch
     - As of ROCm
   * - ``torch.compiler`` (AOT Autograd)
     - Autograd captures not only the user-level code, but also backpropagation,
       which results in capturing the backwards pass "ahead-of-time". This
       enables acceleration of both forwards and backwards pass using
       ``TorchInductor``.
     - 2.0
     - 5.3
   * - ``torch.compiler`` (TorchInductor)
     - The default ``torch.compile`` deep learning compiler that generates fast
       code for multiple accelerators and backends. You need to use a backend
       compiler to make speedups through ``torch.compile`` possible. For AMD,
       NVIDIA, and Intel GPUs, it leverages OpenAI Triton as the key building block.
     - 2.0
     - 5.3
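
A minimal ``torch.compile`` sketch; TorchInductor is the default backend and
uses Triton for GPU code generation, as described above:

.. code-block:: python

   import torch

   @torch.compile
   def f(x):
       return torch.sin(x) ** 2 + torch.cos(x) ** 2

   x = torch.randn(8, device="cuda")
   print(f(x))                          # compiled on first call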

Supported modules
--------------------------------------------------------------------------------

For a complete and up-to-date list of PyTorch core modules (for example,
``torch``, ``torch.nn``, ``torch.cuda``, ``torch.backends.cuda`` and
``torch.backends.cudnn``), their descriptions, and usage, refer directly
to the `official PyTorch documentation <https://pytorch.org/docs/stable/index.html>`_.

Core PyTorch functionality on ROCm includes tensor operations, neural network
layers, automatic differentiation, distributed training, mixed-precision
training, compilation features, and domain-specific libraries for audio, vision,
text processing, and more.

Supported domain libraries
--------------------------------------------------------------------------------

PyTorch offers specialized `domain libraries <https://pytorch.org/domains/>`_ with
GPU acceleration that build on its core features to support specific application
areas. The table below lists the PyTorch domain libraries that are compatible
with ROCm.

.. list-table::
   :header-rows: 1

   * - Library
     - Description
   * - `torchaudio <https://docs.pytorch.org/audio/stable/index.html>`_
     - Audio and signal processing library for PyTorch. Provides utilities for
       audio I/O, signal and data processing functions, datasets, model
       implementations, and application components for audio and speech
       processing tasks.

       **Note:** To ensure GPU acceleration with ``torchaudio.transforms``, you
       need to explicitly move audio data (the waveform tensor) to the GPU
       using ``.to('cuda')``.
   * - `torchvision <https://docs.pytorch.org/vision/stable/index.html>`_
     - Computer vision library that is part of the PyTorch project. Provides
       popular datasets, model architectures, and common image transformations
       for computer vision applications.
   * - `torchtune <https://docs.pytorch.org/torchtune/stable/index.html>`_
     - PyTorch-native library designed for authoring, fine-tuning and
       experimenting with large language models (LLMs). Supports the full
       fine-tuning workflow and offers compatibility with popular production
       inference systems.

       **Note:** Only official releases exist.
   * - `torchtext <https://docs.pytorch.org/text/stable/index.html>`_
     - Text processing library for PyTorch. Provides data processing utilities
       and popular datasets for natural language processing, including
       tokenization, vocabulary management, and text embeddings.

       **Note:** ``torchtext`` does not implement ROCm-specific kernels. ROCm
       acceleration is provided through the underlying PyTorch framework and
       ROCm library integration; move tensors explicitly to the GPU using
       ``.to('cuda')``. Only official releases exist.
   * - `torchdata <https://docs.pytorch.org/data/beta/index.html>`_
     - Beta library of common modular data loading primitives for easily
       constructing flexible and performant data pipelines, with features still
       in the prototype stage.
   * - `torchrec <https://docs.pytorch.org/torchrec/>`_
     - PyTorch domain library for common sparsity and parallelism primitives
       needed for large-scale recommender systems, enabling authors to train
       models with large embedding tables shared across many GPUs.

       **Note:** ``torchrec`` does not implement ROCm-specific kernels. ROCm
       acceleration is provided through the underlying PyTorch framework and
       ROCm library integration. Only official releases exist.
   * - `torchserve <https://docs.pytorch.org/serve/>`_
     - Performant, flexible and easy-to-use tool for serving PyTorch models in
       production, providing features for model management, batch processing,
       and scalable deployment.

       **Note:** ``torchserve`` is no longer actively maintained. The last
       official release shipped with PyTorch 2.4.
   * - `torchrl <https://docs.pytorch.org/rl/stable/index.html>`_
     - Open-source, Python-first reinforcement learning library for PyTorch
       with a focus on high modularity and good runtime performance, providing
       low- and high-level RL abstractions and reusable functionals for cost
       functions, returns, and data processing.

       **Note:** Only official releases exist.
   * - `tensordict <https://docs.pytorch.org/tensordict/stable/index.html>`_
     - Dictionary-like class that simplifies operations on batches of tensors,
       enhancing code readability, compactness, and modularity by abstracting
       tailored operations and reducing errors through automatic operation
       dispatching.

       **Note:** Only official releases exist.

The following ``torchaudio`` features are GPU-accelerated; a usage sketch
follows after the tables below.

.. list-table::
   :header-rows: 1

   * - Feature
     - Description
     - As of torchaudio version
     - As of ROCm
   * - ``torchaudio.transforms.Spectrogram``
     - Generates a spectrogram of an input waveform using STFT.
     - 0.6.0
     - 4.5
   * - ``torchaudio.transforms.MelSpectrogram``
     - Generates the mel-scale spectrogram of raw audio signals.
     - 0.9.0
     - 4.5
   * - ``torchaudio.transforms.MFCC``
     - Extraction of MFCC features.
     - 0.9.0
     - 4.5
   * - ``torchaudio.transforms.Resample``
     - Resamples a signal from one frequency to another.
     - 0.9.0
     - 4.5

The following ``torchvision`` features are GPU-accelerated.

.. list-table::
   :header-rows: 1

   * - Feature
     - Description
     - As of torchvision version
     - As of ROCm
   * - ``torchvision.transforms.functional``
     - Provides GPU-compatible transformations for image preprocessing like
       resize, normalize, rotate and crop.
     - 0.2.0
     - 4.0
   * - ``torchvision.ops``
     - GPU-accelerated operations for object detection and segmentation tasks:
       ``torchvision.ops.roi_align``, ``torchvision.ops.nms`` and
       ``box_convert``.
     - 0.6.0
     - 3.3
   * - ``torchvision.models`` with ``.to('cuda')``
     - ``torchvision`` provides several pre-trained models (ResNet, Faster
       R-CNN, Mask R-CNN, ...) that can run on CUDA for faster inference and
       training.
     - 0.1.6
     - 2.x
   * - ``torchvision.io``
     - Enables video decoding and frame extraction using GPU acceleration with
       NVDEC and nvJPEG (rocJPEG on ROCm).
     - 0.4.0
     - 6.3
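
As the torchaudio note above says, GPU execution requires moving both the
transform and the waveform to the device explicitly. A minimal sketch:

.. code-block:: python

   import torch
   import torchaudio.transforms as T

   waveform = torch.randn(1, 16000).to("cuda")     # 1 s of synthetic audio
   spectrogram = T.Spectrogram(n_fft=400).to("cuda")

   spec = spectrogram(waveform)                    # computed on the GPU
   print(spec.shape, spec.device)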

Unsupported PyTorch features
================================================================================

The following GPU-accelerated PyTorch features are not supported by ROCm for
the listed supported PyTorch versions.

.. list-table::
   :widths: 30, 60, 10
   :header-rows: 1

   * - Feature
     - Description
     - As of PyTorch
   * - APEX batch norm
     - Use APEX batch norm instead of PyTorch batch norm.
     - 1.6.0
   * - ``torch.backends.cuda`` / ``matmul.allow_tf32``
     - A bool that controls whether TensorFloat-32 tensor cores may be used in
       matrix multiplications.
     - 1.7
   * - ``torch.cuda`` / NVIDIA Tools Extension (NVTX)
     - Integration with NVTX for profiling and debugging GPU performance using
       NVIDIA's Nsight tools.
     - 1.7.0
   * - ``torch.cuda`` / Lazy loading NVRTC
     - Delays JIT compilation with NVRTC until the code is explicitly needed.
     - 1.8.0
   * - ``torch-tensorrt``
     - Integrates the TensorRT library for optimizing and deploying PyTorch
       models. ROCm does not have an equivalent library for TensorRT.
     - 1.9.0
   * - ``torch.backends`` / ``cudnn.allow_tf32``
     - TensorFloat-32 tensor cores may be used in cuDNN convolutions.
     - 1.10.0
   * - ``torch.backends.cuda`` / ``matmul.allow_fp16_reduced_precision_reduction``
     - Reduced precision reductions with fp16 accumulation type are
       allowed with fp16 GEMMs.
     - 2.0
   * - ``torch.backends.cuda`` / ``matmul.allow_bf16_reduced_precision_reduction``
     - Reduced precision reductions are allowed with bf16 GEMMs.
     - 2.0
   * - ``torch.nn.functional`` / ``scaled_dot_product_attention``
     - Flash attention backend for SDPA to accelerate attention computation in
       transformer-based models.
     - 2.0
   * - ``torch.backends.cuda`` / ``enable_cudnn_sdp``
     - Globally enables cuDNN SDPA's kernels within SDPA.
     - 2.0
   * - ``torch.backends.cuda`` / ``enable_flash_sdp``
     - Globally enables or disables FlashAttention for SDPA.
     - 2.1
   * - ``torch.backends.cuda`` / ``enable_mem_efficient_sdp``
     - Globally enables or disables Memory-Efficient Attention for SDPA.
     - 2.1
   * - ``torch.backends.cuda`` / ``enable_math_sdp``
     - Globally enables or disables the PyTorch C++ implementation within SDPA.
     - 2.1
   * - Dynamic parallelism
     - PyTorch itself does not directly expose dynamic parallelism as a core
       feature. Dynamic parallelism allows GPU threads to launch additional
       threads, and can be reached using custom operations via the
       ``torch.utils.cpp_extension`` module.
     - Not a core feature
   * - Unified memory support in PyTorch
     - Unified memory is not directly exposed in PyTorch's core API, but it can
       be utilized effectively through custom CUDA extensions or advanced
       workflows.
     - Not a core feature
docs/conf.py
@@ -34,86 +34,69 @@ project = "ROCm Documentation"

project_path = os.path.abspath(".").replace("\\", "/")
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
version = "6.4.1"
release = "6.4.1"
setting_all_article_info = True
all_article_info_os = ["linux", "windows"]
all_article_info_author = ""

# pages with specific settings
article_pages = [
    {"file": "preview/index", "os": ["linux"],},
    {"file": "preview/release", "os": ["linux"],},
    {"file": "preview/install/index", "os": ["linux"],},
    {"file": "preview/install/instinct-driver", "os": ["linux"],},
    {"file": "preview/install/rocm", "os": ["linux"],},
    {"file": "preview/benchmark-docker/index", "os": ["linux"],},
    {"file": "preview/benchmark-docker/training", "os": ["linux"],},
    {"file": "preview/benchmark-docker/pre-training-megatron-lm-llama-3-8b", "os": ["linux"],},
    {"file": "preview/benchmark-docker/pre-training-torchtitan-llama-3-70b", "os": ["linux"],},
    {"file": "preview/benchmark-docker/fine-tuning-lora-llama-2-70b", "os": ["linux"],},
    {"file": "preview/benchmark-docker/inference", "os": ["linux"],},
    {"file": "preview/benchmark-docker/inference-vllm-llama-3.1-405b-fp4", "os": ["linux"],},
    {"file": "preview/benchmark-docker/inference-sglang-deepseek-r1-fp4", "os": ["linux"],},
    {"file": "about/release-notes", "os": ["linux"], "date": "2025-05-07"},
    {"file": "release/changelog", "os": ["linux"],},
    {"file": "compatibility/compatibility-matrix", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/pytorch-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/tensorflow-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/jax-compatibility", "os": ["linux"]},
    {"file": "how-to/deep-learning-rocm", "os": ["linux"]},
{"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/install", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/system-health-check", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/training/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/train-a-model", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/prerequisite-system-validation", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/megatron-lm", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/benchmark-docker/mpt-llm-foundry", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/training/scale-model-training", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/overview", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/single-gpu-fine-tuning-and-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/fine-tuning/multi-gpu-fine-tuning-and-inference", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/hugging-face-models", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/llm-inference-frameworks", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/vllm", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250513", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/index", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/model-quantization", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/model-acceleration-libraries", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/optimizing-with-composable-kernel", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/optimizing-triton-kernel", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/profiling-and-debugging", "os": ["linux"]},
|
||||
{"file": "how-to/rocm-for-ai/inference-optimization/workload", "os": ["linux"]},
|
||||
|
||||
{"file": "how-to/system-optimization/index", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi300x", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi200", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/mi100", "os": ["linux"]},
|
||||
{"file": "how-to/system-optimization/w6000-v620", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/index", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/system", "os": ["linux"]},
|
||||
{"file": "how-to/tuning-guides/mi300x/workload", "os": ["linux"]},
|
||||
{"file": "how-to/system-debugging", "os": ["linux"]},
|
||||
{"file": "how-to/gpu-enabled-mpi", "os": ["linux"]},
|
||||
]
|
||||
|
||||
external_toc_path = "./sphinx/_toc.yml"
|
||||
# Options to improve documentation build time for preview release documentation
|
||||
external_toc_exclude_missing = True # don't build files that aren't in the TOC
|
||||
external_projects_remote_repository = "" # don't fetch data to resolve intersphinx xrefs
|
||||
|
||||
# Add the _extensions directory to Python's search path
|
||||
sys.path.append(str(Path(__file__).parent / 'extension'))
|
||||
@@ -139,7 +122,7 @@ html_static_path = ["sphinx/static/css", "extension/how-to/rocm-for-ai/inference
html_css_files = ["rocm_custom.css", "rocm_rn.css", "vllm-benchmark.css"]
html_js_files = ["vllm-benchmark.js"]

html_title = "ROCm Documentation"

html_theme_options = {"link_main_doc": False}

@@ -1,159 +0,0 @@
vllm_benchmark:
  unified_docker:
    latest:
      pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.7.3_20250325
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640
      rocm_version: 6.3.1
      vllm_version: 0.7.3
      pytorch_version: 2.7.0 (dev nightly)
      hipblaslt_version: 0.13
  model_groups:
    - group: Llama
      tag: llama
      models:
        - model: Llama 3.1 8B
          mad_tag: pyt_vllm_llama-3.1-8b
          model_repo: meta-llama/Llama-3.1-8B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-8B
          precision: float16
        - model: Llama 3.1 70B
          mad_tag: pyt_vllm_llama-3.1-70b
          model_repo: meta-llama/Llama-3.1-70B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
          precision: float16
        - model: Llama 3.1 405B
          mad_tag: pyt_vllm_llama-3.1-405b
          model_repo: meta-llama/Llama-3.1-405B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
          precision: float16
        - model: Llama 3.2 11B Vision
          mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
          model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
          precision: float16
        - model: Llama 2 7B
          mad_tag: pyt_vllm_llama-2-7b
          model_repo: meta-llama/Llama-2-7b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
          precision: float16
        - model: Llama 2 70B
          mad_tag: pyt_vllm_llama-2-70b
          model_repo: meta-llama/Llama-2-70b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
          precision: float16
        - model: Llama 3.1 8B FP8
          mad_tag: pyt_vllm_llama-3.1-8b_fp8
          model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 70B FP8
          mad_tag: pyt_vllm_llama-3.1-70b_fp8
          model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 405B FP8
          mad_tag: pyt_vllm_llama-3.1-405b_fp8
          model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
          precision: float8
    - group: Mistral
      tag: mistral
      models:
        - model: Mixtral MoE 8x7B
          mad_tag: pyt_vllm_mixtral-8x7b
          model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
          precision: float16
        - model: Mixtral MoE 8x22B
          mad_tag: pyt_vllm_mixtral-8x22b
          model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
          precision: float16
        - model: Mistral 7B
          mad_tag: pyt_vllm_mistral-7b
          model_repo: mistralai/Mistral-7B-Instruct-v0.3
          url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
          precision: float16
        - model: Mixtral MoE 8x7B FP8
          mad_tag: pyt_vllm_mixtral-8x7b_fp8
          model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mixtral MoE 8x22B FP8
          mad_tag: pyt_vllm_mixtral-8x22b_fp8
          model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mistral 7B FP8
          mad_tag: pyt_vllm_mistral-7b_fp8
          model_repo: amd/Mistral-7B-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
          precision: float8
    - group: Qwen
      tag: qwen
      models:
        - model: Qwen2 7B
          mad_tag: pyt_vllm_qwen2-7b
          model_repo: Qwen/Qwen2-7B-Instruct
          url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
          precision: float16
        - model: Qwen2 72B
          mad_tag: pyt_vllm_qwen2-72b
          model_repo: Qwen/Qwen2-72B-Instruct
          url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
          precision: float16
    - group: JAIS
      tag: jais
      models:
        - model: JAIS 13B
          mad_tag: pyt_vllm_jais-13b
          model_repo: core42/jais-13b-chat
          url: https://huggingface.co/core42/jais-13b-chat
          precision: float16
        - model: JAIS 30B
          mad_tag: pyt_vllm_jais-30b
          model_repo: core42/jais-30b-chat-v3
          url: https://huggingface.co/core42/jais-30b-chat-v3
          precision: float16
    - group: DBRX
      tag: dbrx
      models:
        - model: DBRX Instruct
          mad_tag: pyt_vllm_dbrx-instruct
          model_repo: databricks/dbrx-instruct
          url: https://huggingface.co/databricks/dbrx-instruct
          precision: float16
        - model: DBRX Instruct FP8
          mad_tag: pyt_vllm_dbrx_fp8
          model_repo: amd/dbrx-instruct-FP8-KV
          url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
          precision: float8
    - group: Gemma
      tag: gemma
      models:
        - model: Gemma 2 27B
          mad_tag: pyt_vllm_gemma-2-27b
          model_repo: google/gemma-2-27b
          url: https://huggingface.co/google/gemma-2-27b
          precision: float16
    - group: Cohere
      tag: cohere
      models:
        - model: C4AI Command R+ 08-2024
          mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
          model_repo: CohereForAI/c4ai-command-r-plus-08-2024
          url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
          precision: float16
        - model: C4AI Command R+ 08-2024 FP8
          mad_tag: pyt_vllm_command-r-plus_fp8
          model_repo: amd/c4ai-command-r-plus-FP8-KV
          url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
          precision: float8
    - group: DeepSeek
      tag: deepseek
      models:
        - model: DeepSeek MoE 16B
          mad_tag: pyt_vllm_deepseek-moe-16b-chat
          model_repo: deepseek-ai/deepseek-moe-16b-chat
          url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
          precision: float16
@@ -1,152 +0,0 @@
vllm_benchmark:
  unified_docker:
    latest:
      pull_tag: rocm/vllm:rocm6.3.1_instinct_vllm0.8.3_20250415
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845
      rocm_version: 6.3.1
      vllm_version: 0.8.3
      pytorch_version: 2.7.0 (dev nightly)
      hipblaslt_version: 0.13
  model_groups:
    - group: Llama
      tag: llama
      models:
        - model: Llama 3.1 8B
          mad_tag: pyt_vllm_llama-3.1-8b
          model_repo: meta-llama/Llama-3.1-8B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-8B
          precision: float16
        - model: Llama 3.1 70B
          mad_tag: pyt_vllm_llama-3.1-70b
          model_repo: meta-llama/Llama-3.1-70B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
          precision: float16
        - model: Llama 3.1 405B
          mad_tag: pyt_vllm_llama-3.1-405b
          model_repo: meta-llama/Llama-3.1-405B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
          precision: float16
        - model: Llama 3.2 11B Vision
          mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
          model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
          precision: float16
        - model: Llama 2 7B
          mad_tag: pyt_vllm_llama-2-7b
          model_repo: meta-llama/Llama-2-7b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
          precision: float16
        - model: Llama 2 70B
          mad_tag: pyt_vllm_llama-2-70b
          model_repo: meta-llama/Llama-2-70b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
          precision: float16
        - model: Llama 3.1 8B FP8
          mad_tag: pyt_vllm_llama-3.1-8b_fp8
          model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 70B FP8
          mad_tag: pyt_vllm_llama-3.1-70b_fp8
          model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 405B FP8
          mad_tag: pyt_vllm_llama-3.1-405b_fp8
          model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
          precision: float8
    - group: Mistral
      tag: mistral
      models:
        - model: Mixtral MoE 8x7B
          mad_tag: pyt_vllm_mixtral-8x7b
          model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
          precision: float16
        - model: Mixtral MoE 8x22B
          mad_tag: pyt_vllm_mixtral-8x22b
          model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
          precision: float16
        - model: Mistral 7B
          mad_tag: pyt_vllm_mistral-7b
          model_repo: mistralai/Mistral-7B-Instruct-v0.3
          url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
          precision: float16
        - model: Mixtral MoE 8x7B FP8
          mad_tag: pyt_vllm_mixtral-8x7b_fp8
          model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mixtral MoE 8x22B FP8
          mad_tag: pyt_vllm_mixtral-8x22b_fp8
          model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mistral 7B FP8
          mad_tag: pyt_vllm_mistral-7b_fp8
          model_repo: amd/Mistral-7B-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
          precision: float8
    - group: Qwen
      tag: qwen
      models:
        - model: Qwen2 7B
          mad_tag: pyt_vllm_qwen2-7b
          model_repo: Qwen/Qwen2-7B-Instruct
          url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
          precision: float16
        - model: Qwen2 72B
          mad_tag: pyt_vllm_qwen2-72b
          model_repo: Qwen/Qwen2-72B-Instruct
          url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
          precision: float16
        - model: QwQ-32B
          mad_tag: pyt_vllm_qwq-32b
          model_repo: Qwen/QwQ-32B
          url: https://huggingface.co/Qwen/QwQ-32B
          precision: float16
          tunableop: true
    - group: DBRX
      tag: dbrx
      models:
        - model: DBRX Instruct
          mad_tag: pyt_vllm_dbrx-instruct
          model_repo: databricks/dbrx-instruct
          url: https://huggingface.co/databricks/dbrx-instruct
          precision: float16
        - model: DBRX Instruct FP8
          mad_tag: pyt_vllm_dbrx_fp8
          model_repo: amd/dbrx-instruct-FP8-KV
          url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
          precision: float8
    - group: Gemma
      tag: gemma
      models:
        - model: Gemma 2 27B
          mad_tag: pyt_vllm_gemma-2-27b
          model_repo: google/gemma-2-27b
          url: https://huggingface.co/google/gemma-2-27b
          precision: float16
    - group: Cohere
      tag: cohere
      models:
        - model: C4AI Command R+ 08-2024
          mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
          model_repo: CohereForAI/c4ai-command-r-plus-08-2024
          url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
          precision: float16
        - model: C4AI Command R+ 08-2024 FP8
          mad_tag: pyt_vllm_command-r-plus_fp8
          model_repo: amd/c4ai-command-r-plus-FP8-KV
          url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
          precision: float8
    - group: DeepSeek
      tag: deepseek
      models:
        - model: DeepSeek MoE 16B
          mad_tag: pyt_vllm_deepseek-moe-16b-chat
          model_repo: deepseek-ai/deepseek-moe-16b-chat
          url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
          precision: float16
@@ -1,167 +0,0 @@
vllm_benchmark:
  unified_docker:
    latest:
      pull_tag: rocm/vllm:rocm6.3.1_vllm0.8.5_20250521
      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11
      rocm_version: 6.3.1
      vllm_version: 0.8.5 (0.8.6.dev315+g91a560098.rocm631)
      pytorch_version: 2.7.0+gitf717b2a
      hipblaslt_version: 0.15
  model_groups:
    - group: Meta Llama
      tag: llama
      models:
        - model: Llama 3.1 8B
          mad_tag: pyt_vllm_llama-3.1-8b
          model_repo: meta-llama/Llama-3.1-8B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-8B
          precision: float16
        - model: Llama 3.1 70B
          mad_tag: pyt_vllm_llama-3.1-70b
          model_repo: meta-llama/Llama-3.1-70B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
          precision: float16
        - model: Llama 3.1 405B
          mad_tag: pyt_vllm_llama-3.1-405b
          model_repo: meta-llama/Llama-3.1-405B-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
          precision: float16
        - model: Llama 3.2 11B Vision
          mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
          model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
          url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
          precision: float16
        - model: Llama 2 7B
          mad_tag: pyt_vllm_llama-2-7b
          model_repo: meta-llama/Llama-2-7b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
          precision: float16
        - model: Llama 2 70B
          mad_tag: pyt_vllm_llama-2-70b
          model_repo: meta-llama/Llama-2-70b-chat-hf
          url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
          precision: float16
        - model: Llama 3.1 8B FP8
          mad_tag: pyt_vllm_llama-3.1-8b_fp8
          model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 70B FP8
          mad_tag: pyt_vllm_llama-3.1-70b_fp8
          model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
          precision: float8
        - model: Llama 3.1 405B FP8
          mad_tag: pyt_vllm_llama-3.1-405b_fp8
          model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
          url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
          precision: float8
    - group: Mistral AI
      tag: mistral
      models:
        - model: Mixtral MoE 8x7B
          mad_tag: pyt_vllm_mixtral-8x7b
          model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
          precision: float16
        - model: Mixtral MoE 8x22B
          mad_tag: pyt_vllm_mixtral-8x22b
          model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
          url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
          precision: float16
        - model: Mistral 7B
          mad_tag: pyt_vllm_mistral-7b
          model_repo: mistralai/Mistral-7B-Instruct-v0.3
          url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
          precision: float16
        - model: Mixtral MoE 8x7B FP8
          mad_tag: pyt_vllm_mixtral-8x7b_fp8
          model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mixtral MoE 8x22B FP8
          mad_tag: pyt_vllm_mixtral-8x22b_fp8
          model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
          precision: float8
        - model: Mistral 7B FP8
          mad_tag: pyt_vllm_mistral-7b_fp8
          model_repo: amd/Mistral-7B-v0.1-FP8-KV
          url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
          precision: float8
    - group: Qwen
      tag: qwen
      models:
        - model: Qwen2 7B
          mad_tag: pyt_vllm_qwen2-7b
          model_repo: Qwen/Qwen2-7B-Instruct
          url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
          precision: float16
        - model: Qwen2 72B
          mad_tag: pyt_vllm_qwen2-72b
          model_repo: Qwen/Qwen2-72B-Instruct
          url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
          precision: float16
        - model: QwQ-32B
          mad_tag: pyt_vllm_qwq-32b
          model_repo: Qwen/QwQ-32B
          url: https://huggingface.co/Qwen/QwQ-32B
          precision: float16
          tunableop: true
    - group: Databricks DBRX
      tag: dbrx
      models:
        - model: DBRX Instruct
          mad_tag: pyt_vllm_dbrx-instruct
          model_repo: databricks/dbrx-instruct
          url: https://huggingface.co/databricks/dbrx-instruct
          precision: float16
        - model: DBRX Instruct FP8
          mad_tag: pyt_vllm_dbrx_fp8
          model_repo: amd/dbrx-instruct-FP8-KV
          url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
          precision: float8
    - group: Google Gemma
      tag: gemma
      models:
        - model: Gemma 2 27B
          mad_tag: pyt_vllm_gemma-2-27b
          model_repo: google/gemma-2-27b
          url: https://huggingface.co/google/gemma-2-27b
          precision: float16
    - group: Cohere
      tag: cohere
      models:
        - model: C4AI Command R+ 08-2024
          mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
          model_repo: CohereForAI/c4ai-command-r-plus-08-2024
          url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
          precision: float16
        - model: C4AI Command R+ 08-2024 FP8
          mad_tag: pyt_vllm_command-r-plus_fp8
          model_repo: amd/c4ai-command-r-plus-FP8-KV
          url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
          precision: float8
    - group: DeepSeek
      tag: deepseek
      models:
        - model: DeepSeek MoE 16B
          mad_tag: pyt_vllm_deepseek-moe-16b-chat
          model_repo: deepseek-ai/deepseek-moe-16b-chat
          url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
          precision: float16
    - group: Microsoft Phi
      tag: phi
      models:
        - model: Phi-4
          mad_tag: pyt_vllm_phi-4
          model_repo: microsoft/phi-4
          url: https://huggingface.co/microsoft/phi-4
    - group: TII Falcon
      tag: falcon
      models:
        - model: Falcon 180B
          mad_tag: pyt_vllm_falcon-180b
          model_repo: tiiuae/falcon-180B
          url: https://huggingface.co/tiiuae/falcon-180B
          precision: float16
@@ -1,10 +1,10 @@
 vllm_benchmark:
   unified_docker:
     latest:
-      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c
-      rocm_version: 6.4.1
-      vllm_version: 0.9.0.1 (0.9.0.2.dev108+g71faa1880.rocm641)
+      pull_tag: rocm/vllm:rocm6.3.1_vllm0.8.5_20250521
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11
+      rocm_version: 6.3.1
+      vllm_version: 0.8.5 (0.8.6.dev315+g91a560098.rocm631)
       pytorch_version: 2.7.0+gitf717b2a
       hipblaslt_version: 0.15
   model_groups:
@@ -26,6 +26,11 @@ vllm_benchmark:
           model_repo: meta-llama/Llama-3.1-405B-Instruct
           url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
           precision: float16
+        - model: Llama 3.2 11B Vision
+          mad_tag: pyt_vllm_llama-3.2-11b-vision-instruct
+          model_repo: meta-llama/Llama-3.2-11B-Vision-Instruct
+          url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct
+          precision: float16
         - model: Llama 2 7B
           mad_tag: pyt_vllm_llama-2-7b
           model_repo: meta-llama/Llama-2-7b-chat-hf
@@ -1,120 +0,0 @@
unified_docker:
  latest:
    pull_tag: rocm/pytorch-training:v25.6
    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
    rocm_version: 6.4.1
    pytorch_version: 2.8.0a0+git7d205b2
    python_version: 3.10.17
    transformer_engine_version: 1.14.0+2f85f5f2
    flash_attention_version: 3.0.0.post1
    hipblaslt_version: 0.15.0-8c6919d
    triton_version: 3.3.0
model_groups:
  - group: Pre-training
    tag: pre-training
    models:
      - model: Llama 3.1 8B
        mad_tag: pyt_train_llama-3.1-8b
        model_repo: Llama-3.1-8B
        url: https://huggingface.co/meta-llama/Llama-3.1-8B
        precision: BF16
        training_modes: [pretrain]
      - model: Llama 3.1 70B
        mad_tag: pyt_train_llama-3.1-70b
        model_repo: Llama-3.1-70B
        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
        precision: BF16
        training_modes: [pretrain]
      - model: FLUX.1-dev
        mad_tag: pyt_train_flux
        model_repo: Flux
        url: https://huggingface.co/black-forest-labs/FLUX.1-dev
        precision: BF16
        training_modes: [pretrain]
  - group: Fine-tuning
    tag: fine-tuning
    models:
      - model: Llama 4 Scout 17B-16E
        mad_tag: pyt_train_llama-4-scout-17b-16e
        model_repo: Llama-4-17B_16E
        url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3.3 70B
        mad_tag: pyt_train_llama-3.3-70b
        model_repo: Llama-3.3-70B
        url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
        precision: BF16
        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
      - model: Llama 3.2 1B
        mad_tag: pyt_train_llama-3.2-1b
        model_repo: Llama-3.2-1B
        url: https://huggingface.co/meta-llama/Llama-3.2-1B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3.2 3B
        mad_tag: pyt_train_llama-3.2-3b
        model_repo: Llama-3.2-3B
        url: https://huggingface.co/meta-llama/Llama-3.2-3B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3.2 Vision 11B
        mad_tag: pyt_train_llama-3.2-vision-11b
        model_repo: Llama-3.2-Vision-11B
        url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
        precision: BF16
        training_modes: [finetune_fw]
      - model: Llama 3.2 Vision 90B
        mad_tag: pyt_train_llama-3.2-vision-90b
        model_repo: Llama-3.2-Vision-90B
        url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
        precision: BF16
        training_modes: [finetune_fw]
      - model: Llama 3.1 8B
        mad_tag: pyt_train_llama-3.1-8b
        model_repo: Llama-3.1-8B
        url: https://huggingface.co/meta-llama/Llama-3.1-8B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3.1 70B
        mad_tag: pyt_train_llama-3.1-70b
        model_repo: Llama-3.1-70B
        url: https://huggingface.co/meta-llama/Llama-3.1-70B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
      - model: Llama 3.1 405B
        mad_tag: pyt_train_llama-3.1-405b
        model_repo: Llama-3.1-405B
        url: https://huggingface.co/meta-llama/Llama-3.1-405B
        precision: BF16
        training_modes: [finetune_qlora, HF_finetune_lora]
      - model: Llama 3 8B
        mad_tag: pyt_train_llama-3-8b
        model_repo: Llama-3-8B
        url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 3 70B
        mad_tag: pyt_train_llama-3-70b
        model_repo: Llama-3-70B
        url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 2 7B
        mad_tag: pyt_train_llama-2-7b
        model_repo: Llama-2-7B
        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
        precision: BF16
        training_modes: [finetune_fw, finetune_lora, finetune_qlora]
      - model: Llama 2 13B
        mad_tag: pyt_train_llama-2-13b
        model_repo: Llama-2-13B
        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
        precision: BF16
        training_modes: [finetune_fw, finetune_lora]
      - model: Llama 2 70B
        mad_tag: pyt_train_llama-2-70b
        model_repo: Llama-2-70B
        url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
        precision: BF16
        training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
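Data files like the one above drive the model tables in the docs, so a small
loader can double as a sanity check. The following is a minimal sketch; it
assumes PyYAML is installed, and the file name is a placeholder rather than a
real path in this repository.

.. code-block:: python

   # Minimal sketch: list fine-tuning models that support LoRA from a data
   # file shaped like the one above. Assumes PyYAML; the file name is a
   # placeholder.
   import yaml

   with open("pytorch-training-models.yaml") as f:
       data = yaml.safe_load(f)

   for group in data["model_groups"]:
       for model in group["models"]:
           if "finetune_lora" in model.get("training_modes", []):
               print(model["model"], "->", model["mad_tag"])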
@@ -1,346 +0,0 @@
:orphan:

.. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
                 ROCm Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

**********************************
vLLM inference performance testing
**********************************

.. caution::

   This documentation does not reflect the latest version of ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.

.. _vllm-benchmark-unified-docker:

The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
a prebuilt, optimized environment designed for validating large language model
(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
MI300X accelerator and includes the following components:

* `ROCm 6.2.0 <https://github.com/ROCm/ROCm>`_

* `vLLM 0.4.3 <https://docs.vllm.ai/en/latest>`_

* `PyTorch 2.4.0 <https://github.com/pytorch/pytorch>`_

* Tuning files (in CSV format)

With this Docker image, you can quickly validate the expected inference
performance numbers on the MI300X accelerator. This topic also provides tips on
optimizing performance with popular AI models.

.. _vllm-benchmark-vllm:

.. note::

   vLLM is a toolkit and library for LLM inference and
   serving. It deploys the PagedAttention algorithm, which reduces memory
   consumption and increases throughput by leveraging dynamic key and value
   allocation in GPU memory. vLLM also incorporates many LLM acceleration
   and quantization algorithms. In addition, AMD implements high-performance
   custom kernels and modules in vLLM to enhance performance further. See
   :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for more
   information.

Getting started
===============

Use the following procedures to reproduce the benchmark results on an
MI300X accelerator with the prebuilt vLLM Docker image.

.. _vllm-benchmark-get-started:

1. Disable NUMA auto-balancing.

   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
   might hang until the periodic balancing is finalized. For more information,
   see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.

   .. code-block:: shell

      # disable automatic NUMA balancing
      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
      # check if NUMA balancing is disabled (returns 0 if disabled)
      cat /proc/sys/kernel/numa_balancing
      0

2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.

   Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

      docker pull rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50

Once setup is complete, you can choose between two options to reproduce the
benchmark results:

- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`

- :ref:`Standalone benchmarking <vllm-benchmark-standalone>`

.. _vllm-benchmark-mad:

MAD-integrated benchmarking
===========================

Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.

.. code-block:: shell

   git clone https://github.com/ROCm/MAD
   cd MAD
   pip install -r requirements.txt

Use this command to run a performance benchmark test of the Llama 3.1 8B model
on one GPU with ``float16`` data type in the host machine.

.. code-block:: shell

   export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
   python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800

ROCm MAD launches a Docker container with the name
``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_float16/``.
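If you want to post-process these reports programmatically, the following
minimal sketch walks the report directory. It assumes only that the summary
files are standard CSV; the exact column names depend on the benchmark script
version, so inspect them before relying on them.

.. code-block:: python

   # Minimal sketch: enumerate MAD summary reports for further analysis.
   # Assumes standard CSV files; column names vary by script version.
   import csv
   from pathlib import Path

   report_dir = Path.home() / "MAD" / "reports_float16"
   for report in sorted(report_dir.glob("**/*_report.csv")):
       with report.open(newline="") as f:
           rows = list(csv.DictReader(f))
       columns = list(rows[0].keys()) if rows else []
       print(f"{report.name}: {len(rows)} rows, columns: {columns}")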
Although the following eight models are preconfigured to collect latency and
throughput performance data, you can also change the benchmarking parameters.
Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.

Available models
----------------

.. hlist::
   :columns: 3

   * ``pyt_vllm_llama-3.1-8b``

   * ``pyt_vllm_llama-3.1-70b``

   * ``pyt_vllm_llama-3.1-405b``

   * ``pyt_vllm_llama-2-7b``

   * ``pyt_vllm_mistral-7b``

   * ``pyt_vllm_qwen2-7b``

   * ``pyt_vllm_jais-13b``

   * ``pyt_vllm_jais-30b``

.. _vllm-benchmark-standalone:

Standalone benchmarking
=======================

You can run the vLLM benchmark tool independently by starting the
:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
snippet.

.. code-block::

   docker pull rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name unified_docker_vllm rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50

In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.

.. code-block::

   git clone https://github.com/ROCm/MAD
   cd MAD/scripts/vllm

Multiprocessing distributed executor
------------------------------------

To optimize vLLM performance, add the multiprocessing API server argument ``--distributed-executor-backend mp``.
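The same backend can also be selected through the vLLM Python API. The
following is a hedged sketch, separate from ``vllm_benchmark_report.sh``; it
assumes the vLLM build in the container accepts the
``distributed_executor_backend`` engine argument.

.. code-block:: python

   # Sketch: select the multiprocessing executor from the vLLM Python API.
   # Assumes the containerized vLLM accepts distributed_executor_backend;
   # tensor_parallel_size=8 mirrors an 8-GPU MI300X node.
   from vllm import LLM, SamplingParams

   llm = LLM(
       model="meta-llama/Meta-Llama-3.1-8B-Instruct",
       tensor_parallel_size=8,
       distributed_executor_backend="mp",  # multiprocessing instead of Ray
   )
   outputs = llm.generate(["Hello, MI300X!"], SamplingParams(max_tokens=32))
   print(outputs[0].outputs[0].text)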
Command
^^^^^^^

To start the benchmark, use the following command with the appropriate options.
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
options and their descriptions.

.. code-block:: shell

   ./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype

See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.

.. note::

   The input sequence length, output sequence length, and tensor parallel (TP) are
   already configured. You don't need to specify them with this script.

.. note::

   If you encounter the following error, pass your access-authorized Hugging
   Face token to the gated models.

   .. code-block:: shell

      OSError: You are trying to access a gated repo.

      # pass your HF_TOKEN
      export HF_TOKEN=$your_personal_hf_token

.. _vllm-benchmark-standalone-options:

Options
^^^^^^^

.. list-table::
   :header-rows: 1

   * - Name
     - Options
     - Description

   * - ``$test_option``
     - latency
     - Measure decoding token latency

   * -
     - throughput
     - Measure token generation throughput

   * -
     - all
     - Measure both throughput and latency

   * - ``$model_repo``
     - ``meta-llama/Meta-Llama-3.1-8B-Instruct``
     - Llama 3.1 8B

   * - (``float16``)
     - ``meta-llama/Meta-Llama-3.1-70B-Instruct``
     - Llama 3.1 70B

   * -
     - ``meta-llama/Meta-Llama-3.1-405B-Instruct``
     - Llama 3.1 405B

   * -
     - ``meta-llama/Llama-2-7b-chat-hf``
     - Llama 2 7B

   * -
     - ``mistralai/Mixtral-8x7B-Instruct-v0.1``
     - Mixtral 8x7B

   * -
     - ``mistralai/Mixtral-8x22B-Instruct-v0.1``
     - Mixtral 8x22B

   * -
     - ``mistralai/Mistral-7B-Instruct-v0.3``
     - Mistral 7B

   * -
     - ``Qwen/Qwen2-7B-Instruct``
     - Qwen2 7B

   * -
     - ``core42/jais-13b-chat``
     - JAIS 13B

   * -
     - ``core42/jais-30b-chat-v3``
     - JAIS 30B

   * - ``$num_gpu``
     - 1 or 8
     - Number of GPUs

   * - ``$datatype``
     - ``float16``
     - Data type

.. _vllm-benchmark-run-benchmark:

Running the benchmark on the MI300X accelerator
-----------------------------------------------

Here are some examples of running the benchmark with various options.
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
options and their descriptions.

Latency benchmark example
^^^^^^^^^^^^^^^^^^^^^^^^^

Use this command to benchmark the latency of the Llama 3.1 8B model on one GPU with the ``float16`` data type.

.. code-block::

   ./vllm_benchmark_report.sh -s latency -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16

Find the latency report at:

- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv``

Throughput benchmark example
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Use this command to benchmark the throughput of the Llama 3.1 8B model on one GPU with the ``float16`` data type.

.. code-block:: shell

   ./vllm_benchmark_report.sh -s throughput -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16

Find the throughput report at:

- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_throughput_report.csv``

.. raw:: html

   <style>
   mjx-container[jax="CHTML"][display="true"] {
     text-align: left;
     margin: 0;
   }
   </style>

.. note::

   Throughput is calculated as:

   - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time

   - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
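As a concrete illustration of these two formulas, the short sketch below
computes both values; the numbers are placeholders, not measured results.

.. code-block:: python

   # Sketch: the two throughput definitions above, with placeholder numbers.
   def throughput_tot(requests, input_len, output_len, elapsed_s):
       """Total tokens (prompt + generated) processed per second."""
       return requests * (input_len + output_len) / elapsed_s

   def throughput_gen(requests, output_len, elapsed_s):
       """Generated tokens per second."""
       return requests * output_len / elapsed_s

   # Example: 100 requests, 2048-token prompts, 128-token outputs, 60 s run.
   print(throughput_tot(100, 2048, 128, 60.0))  # ~3626.7 tokens/s
   print(throughput_gen(100, 128, 60.0))        # ~213.3 tokens/s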
Further reading
===============

- For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/tuning-guides/mi300x/workload`.

- To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

- To learn more about system settings and management practices to configure your system for
  MI300X accelerators, see :doc:`/how-to/system-optimization/mi300x`.

- To learn how to run LLM models from Hugging Face or your own model, see
  :doc:`Using ROCm for AI </how-to/rocm-for-ai/index>`.

- To learn how to optimize inference on LLMs, see
  :doc:`Fine-tuning LLMs and inference optimization </how-to/llm-fine-tuning-optimization/index>`.

- For a list of other ready-made Docker images for ROCm, see the
  :doc:`Docker image support matrix <rocm-install-on-linux:reference/docker-image-support-matrix>`.

Previous versions
=================

See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.
@@ -1,419 +0,0 @@
:orphan:

.. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
                 ROCm Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

**********************************
vLLM inference performance testing
**********************************

.. caution::

   This documentation does not reflect the latest version of ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.

.. _vllm-benchmark-unified-docker:

The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
a prebuilt, optimized environment designed for validating large language model
(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
MI300X accelerator and includes the following components:

* `ROCm 6.2.1 <https://github.com/ROCm/ROCm>`_

* `vLLM 0.6.4 <https://docs.vllm.ai/en/latest>`_

* `PyTorch 2.5.0 <https://github.com/pytorch/pytorch>`_

* Tuning files (in CSV format)

With this Docker image, you can quickly validate the expected inference
performance numbers on the MI300X accelerator. This topic also provides tips on
optimizing performance with popular AI models.

.. hlist::
   :columns: 6

   * Llama 3.1 8B

   * Llama 3.1 70B

   * Llama 3.1 405B

   * Llama 2 7B

   * Llama 2 70B

   * Mixtral 8x7B

   * Mixtral 8x22B

   * Mistral 7B

   * Qwen2 7B

   * Qwen2 72B

   * JAIS 13B

   * JAIS 30B

.. _vllm-benchmark-vllm:

.. note::

   vLLM is a toolkit and library for LLM inference and serving. AMD implements
   high-performance custom kernels and modules in vLLM to enhance performance.
   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
   more information.

Getting started
===============

Use the following procedures to reproduce the benchmark results on an
MI300X accelerator with the prebuilt vLLM Docker image.

.. _vllm-benchmark-get-started:

1. Disable NUMA auto-balancing.

   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
   might hang until the periodic balancing is finalized. For more information,
   see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.

   .. code-block:: shell

      # disable automatic NUMA balancing
      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
      # check if NUMA balancing is disabled (returns 0 if disabled)
      cat /proc/sys/kernel/numa_balancing
      0

2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.

   Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

      docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4

Once setup is complete, you can choose between two options to reproduce the
benchmark results:

- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`

- :ref:`Standalone benchmarking <vllm-benchmark-standalone>`

.. _vllm-benchmark-mad:

MAD-integrated benchmarking
===========================

Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.

.. code-block:: shell

   git clone https://github.com/ROCm/MAD
   cd MAD
   pip install -r requirements.txt

Use this command to run a performance benchmark test of the Llama 3.1 8B model
on one GPU with ``float16`` data type in the host machine.

.. code-block:: shell

   export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
   python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800

ROCm MAD launches a Docker container with the name
``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_float16/``.

Although the following models are preconfigured to collect latency and
throughput performance data, you can also change the benchmarking parameters.
Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.

Available models
----------------

.. hlist::
   :columns: 3

   * ``pyt_vllm_llama-3.1-8b``

   * ``pyt_vllm_llama-3.1-70b``

   * ``pyt_vllm_llama-3.1-405b``

   * ``pyt_vllm_llama-2-7b``

   * ``pyt_vllm_llama-2-70b``

   * ``pyt_vllm_mixtral-8x7b``

   * ``pyt_vllm_mixtral-8x22b``

   * ``pyt_vllm_mistral-7b``

   * ``pyt_vllm_qwen2-7b``

   * ``pyt_vllm_qwen2-72b``

   * ``pyt_vllm_jais-13b``

   * ``pyt_vllm_jais-30b``

   * ``pyt_vllm_llama-3.1-8b_fp8``

   * ``pyt_vllm_llama-3.1-70b_fp8``

   * ``pyt_vllm_llama-3.1-405b_fp8``

   * ``pyt_vllm_mixtral-8x7b_fp8``

   * ``pyt_vllm_mixtral-8x22b_fp8``

.. _vllm-benchmark-standalone:

Standalone benchmarking
=======================

You can run the vLLM benchmark tool independently by starting the
:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
snippet.

.. code-block::

   docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.4 rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4

In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.

.. code-block::

   git clone https://github.com/ROCm/MAD
   cd MAD/scripts/vllm

Command
-------

To start the benchmark, use the following command with the appropriate options.
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
options and their descriptions.

.. code-block:: shell

   ./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype

See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.

.. note::

   The input sequence length, output sequence length, and tensor parallel (TP) are
   already configured. You don't need to specify them with this script.

.. note::

   If you encounter the following error, pass your access-authorized Hugging
   Face token to the gated models.

   .. code-block:: shell

      OSError: You are trying to access a gated repo.

      # pass your HF_TOKEN
      export HF_TOKEN=$your_personal_hf_token
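If you prefer to authenticate from Python instead of an environment variable,
a minimal sketch using the ``huggingface_hub`` client (assuming the package is
available in the container) looks like this:

.. code-block:: python

   # Sketch: authenticate to Hugging Face from Python rather than via HF_TOKEN
   # alone. Assumes the huggingface_hub package is installed in the container.
   import os
   from huggingface_hub import login

   # Read the token from the environment so it is never hard-coded.
   login(token=os.environ["HF_TOKEN"])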
.. _vllm-benchmark-standalone-options:

Options
-------

.. list-table::
   :header-rows: 1
   :align: center

   * - Name
     - Options
     - Description

   * - ``$test_option``
     - latency
     - Measure decoding token latency

   * -
     - throughput
     - Measure token generation throughput

   * -
     - all
     - Measure both throughput and latency

   * - ``$model_repo``
     - ``meta-llama/Meta-Llama-3.1-8B-Instruct``
     - Llama 3.1 8B

   * - (``float16``)
     - ``meta-llama/Meta-Llama-3.1-70B-Instruct``
     - Llama 3.1 70B

   * -
     - ``meta-llama/Meta-Llama-3.1-405B-Instruct``
     - Llama 3.1 405B

   * -
     - ``meta-llama/Llama-2-7b-chat-hf``
     - Llama 2 7B

   * -
     - ``meta-llama/Llama-2-70b-chat-hf``
     - Llama 2 70B

   * -
     - ``mistralai/Mixtral-8x7B-Instruct-v0.1``
     - Mixtral 8x7B

   * -
     - ``mistralai/Mixtral-8x22B-Instruct-v0.1``
     - Mixtral 8x22B

   * -
     - ``mistralai/Mistral-7B-Instruct-v0.3``
     - Mistral 7B

   * -
     - ``Qwen/Qwen2-7B-Instruct``
     - Qwen2 7B

   * -
     - ``Qwen/Qwen2-72B-Instruct``
     - Qwen2 72B

   * -
     - ``core42/jais-13b-chat``
     - JAIS 13B

   * -
     - ``core42/jais-30b-chat-v3``
     - JAIS 30B

   * - ``$model_repo``
     - ``amd/Meta-Llama-3.1-8B-Instruct-FP8-KV``
     - Llama 3.1 8B

   * - (``float8``)
     - ``amd/Meta-Llama-3.1-70B-Instruct-FP8-KV``
     - Llama 3.1 70B

   * -
     - ``amd/Meta-Llama-3.1-405B-Instruct-FP8-KV``
     - Llama 3.1 405B

   * -
     - ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
     - Mixtral 8x7B

   * -
     - ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
     - Mixtral 8x22B

   * - ``$num_gpu``
     - 1 or 8
     - Number of GPUs

   * - ``$datatype``
     - ``float16`` or ``float8``
     - Data type

.. _vllm-benchmark-run-benchmark:

Running the benchmark on the MI300X accelerator
-----------------------------------------------

Here are some examples of running the benchmark with various options.
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
options and their descriptions.

Example 1: latency benchmark
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Use this command to benchmark the latency of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.

.. code-block::

   ./vllm_benchmark_report.sh -s latency -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
   ./vllm_benchmark_report.sh -s latency -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8

Find the latency reports at:

- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv``

- ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_latency_report.csv``

Example 2: throughput benchmark
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Use this command to benchmark the throughput of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.

.. code-block:: shell

   ./vllm_benchmark_report.sh -s throughput -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
   ./vllm_benchmark_report.sh -s throughput -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8

Find the throughput reports at:

- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_throughput_report.csv``

- ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_throughput_report.csv``

.. raw:: html

   <style>
   mjx-container[jax="CHTML"][display="true"] {
     text-align: left;
     margin: 0;
   }
   </style>

.. note::

   Throughput is calculated as:

   - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time

   - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time

Further reading
===============

- For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/tuning-guides/mi300x/workload`.

- To learn more about the options for latency and throughput benchmark scripts,
  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

- To learn more about system settings and management practices to configure your system for
  MI300X accelerators, see :doc:`/how-to/system-optimization/mi300x`.

- To learn how to run LLM models from Hugging Face or your own model, see
  :doc:`Using ROCm for AI </how-to/rocm-for-ai/index>`.

- To learn how to optimize inference on LLMs, see
  :doc:`Fine-tuning LLMs and inference optimization </how-to/llm-fine-tuning-optimization/index>`.

- For a list of other ready-made Docker images for ROCm, see the
  :doc:`Docker image support matrix <rocm-install-on-linux:reference/docker-image-support-matrix>`.

- To compare with the previous version of the ROCm vLLM Docker image for performance validation, refer to
  `LLM inference performance validation on AMD Instinct MI300X (ROCm 6.2.0) <https://rocm.docs.amd.com/en/docs-6.2.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_.

Previous versions
=================

See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.
@@ -1,461 +0,0 @@
:orphan:

.. meta::
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
                 ROCm vLLM Docker image.
   :keywords: model, MAD, automation, dashboarding, validate

***********************************************************
LLM inference performance validation on AMD Instinct MI300X
***********************************************************

.. caution::

   This documentation does not reflect the latest version of ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.

.. _vllm-benchmark-unified-docker:

The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on the AMD Instinct™ MI300X accelerator. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for the MI300X
accelerator and includes the following components:

* `ROCm 6.3.1 <https://github.com/ROCm/ROCm>`_

* `vLLM 0.6.6 <https://docs.vllm.ai/en/latest>`_

* `PyTorch 2.7.0 (2.7.0a0+git3a58512) <https://github.com/pytorch/pytorch>`_

With this Docker image, you can quickly validate the expected inference
performance numbers for the MI300X accelerator. This topic also provides tips on
optimizing performance with popular AI models. For more information, see the lists of
:ref:`available models for MAD-integrated benchmarking <vllm-benchmark-mad-models>`
and :ref:`standalone benchmarking <vllm-benchmark-standalone-options>`.

.. _vllm-benchmark-vllm:

.. note::

   vLLM is a toolkit and library for LLM inference and serving. AMD implements
   high-performance custom kernels and modules in vLLM to enhance performance.
   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
   more information.

Getting started
===============

Use the following procedures to reproduce the benchmark results on an
MI300X accelerator with the prebuilt vLLM Docker image.

.. _vllm-benchmark-get-started:

1. Disable NUMA auto-balancing.

   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
   might hang until the periodic balancing is finalized. For more information,
   see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.

   .. code-block:: shell

      # disable automatic NUMA balancing
      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
      # check if NUMA balancing is disabled (returns 0 if disabled)
      cat /proc/sys/kernel/numa_balancing
      0

2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.

   Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

      docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6

Once the setup is complete, choose between two options to reproduce the
benchmark results:

- :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`

- :ref:`Standalone benchmarking <vllm-benchmark-standalone>`

.. _vllm-benchmark-mad:

MAD-integrated benchmarking
===========================

Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.

.. code-block:: shell

   git clone https://github.com/ROCm/MAD
   cd MAD
   pip install -r requirements.txt

Use this command to run a performance benchmark test of the Llama 3.1 8B model
on one GPU with ``float16`` data type in the host machine.

.. code-block:: shell

   export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
   python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800

ROCm MAD launches a Docker container with the name
``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_float16/``.

Although the following models are preconfigured to collect latency and
throughput performance data, you can also change the benchmarking parameters.
Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.

.. _vllm-benchmark-mad-models:

Available models
----------------

.. list-table::
   :header-rows: 1
   :widths: 2, 3

   * - Model name
     - Tag

   * - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`_
     - ``pyt_vllm_llama-3.1-8b``

   * - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_
     - ``pyt_vllm_llama-3.1-70b``

   * - `Llama 3.1 405B <https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct>`_
     - ``pyt_vllm_llama-3.1-405b``

   * - `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_
     - ``pyt_vllm_llama-3.2-11b-vision-instruct``

   * - `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`_
     - ``pyt_vllm_llama-2-7b``

   * - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`_
     - ``pyt_vllm_llama-2-70b``

   * - `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_
     - ``pyt_vllm_mixtral-8x7b``

   * - `Mixtral MoE 8x22B <https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1>`_
     - ``pyt_vllm_mixtral-8x22b``

   * - `Mistral 7B <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`_
     - ``pyt_vllm_mistral-7b``

   * - `Qwen2 7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_
     - ``pyt_vllm_qwen2-7b``

   * - `Qwen2 72B <https://huggingface.co/Qwen/Qwen2-72B-Instruct>`_
     - ``pyt_vllm_qwen2-72b``

   * - `JAIS 13B <https://huggingface.co/core42/jais-13b-chat>`_
     - ``pyt_vllm_jais-13b``

   * - `JAIS 30B <https://huggingface.co/core42/jais-30b-chat-v3>`_
     - ``pyt_vllm_jais-30b``

   * - `DBRX Instruct <https://huggingface.co/databricks/dbrx-instruct>`_
     - ``pyt_vllm_dbrx-instruct``

   * - `Gemma 2 27B <https://huggingface.co/google/gemma-2-27b>`_
     - ``pyt_vllm_gemma-2-27b``

   * - `C4AI Command R+ 08-2024 <https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024>`_
     - ``pyt_vllm_c4ai-command-r-plus-08-2024``

   * - `DeepSeek MoE 16B <https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat>`_
     - ``pyt_vllm_deepseek-moe-16b-chat``

   * - `Llama 3.1 70B FP8 <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`_
     - ``pyt_vllm_llama-3.1-70b_fp8``

   * - `Llama 3.1 405B FP8 <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`_
     - ``pyt_vllm_llama-3.1-405b_fp8``

   * - `Mixtral MoE 8x7B FP8 <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`_
     - ``pyt_vllm_mixtral-8x7b_fp8``

   * - `Mixtral MoE 8x22B FP8 <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`_
     - ``pyt_vllm_mixtral-8x22b_fp8``

   * - `Mistral 7B FP8 <https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV>`_
     - ``pyt_vllm_mistral-7b_fp8``

   * - `DBRX Instruct FP8 <https://huggingface.co/amd/dbrx-instruct-FP8-KV>`_
     - ``pyt_vllm_dbrx_fp8``

   * - `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_
     - ``pyt_vllm_command-r-plus_fp8``

.. _vllm-benchmark-standalone:

Standalone benchmarking
=======================

You can run the vLLM benchmark tool independently by starting the
:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
snippet.

.. code-block::

   docker pull rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6
   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.6 rocm/vllm:rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6

In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.

.. code-block::

   git clone https://github.com/ROCm/MAD
   cd MAD/scripts/vllm

Command
-------

To start the benchmark, use the following command with the appropriate options.
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
options and their descriptions.

.. code-block:: shell

   ./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype

See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.

.. note::

   The input sequence length, output sequence length, and tensor parallel (TP) are
   already configured. You don't need to specify them with this script.

.. note::

   If you encounter the following error, pass your access-authorized Hugging
   Face token to the gated models.

   .. code-block:: shell

      OSError: You are trying to access a gated repo.

      # pass your HF_TOKEN
      export HF_TOKEN=$your_personal_hf_token

.. _vllm-benchmark-standalone-options:

Options and available models
----------------------------

.. list-table::
   :header-rows: 1
   :align: center

   * - Name
     - Options
     - Description

   * - ``$test_option``
     - latency
     - Measure decoding token latency

   * -
     - throughput
     - Measure token generation throughput

   * -
     - all
     - Measure both throughput and latency

   * - ``$model_repo``
     - ``meta-llama/Llama-3.1-8B-Instruct``
     - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`_

   * - (``float16``)
     - ``meta-llama/Llama-3.1-70B-Instruct``
     - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_

   * -
     - ``meta-llama/Llama-3.1-405B-Instruct``
     - `Llama 3.1 405B <https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct>`_

   * -
     - ``meta-llama/Llama-3.2-11B-Vision-Instruct``
     - `Llama 3.2 11B Vision <https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct>`_

   * -
     - ``meta-llama/Llama-2-7b-chat-hf``
     - `Llama 2 7B <https://huggingface.co/meta-llama/Llama-2-7b-chat-hf>`_

   * -
     - ``meta-llama/Llama-2-70b-chat-hf``
     - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70b-chat-hf>`_

   * -
     - ``mistralai/Mixtral-8x7B-Instruct-v0.1``
     - `Mixtral MoE 8x7B <https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1>`_

   * -
     - ``mistralai/Mixtral-8x22B-Instruct-v0.1``
     - `Mixtral MoE 8x22B <https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1>`_

   * -
     - ``mistralai/Mistral-7B-Instruct-v0.3``
     - `Mistral 7B <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`_

   * -
     - ``Qwen/Qwen2-7B-Instruct``
     - `Qwen2 7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_

   * -
     - ``Qwen/Qwen2-72B-Instruct``
     - `Qwen2 72B <https://huggingface.co/Qwen/Qwen2-72B-Instruct>`_

   * -
     - ``core42/jais-13b-chat``
     - `JAIS 13B <https://huggingface.co/core42/jais-13b-chat>`_

   * -
     - ``core42/jais-30b-chat-v3``
     - `JAIS 30B <https://huggingface.co/core42/jais-30b-chat-v3>`_

   * -
     - ``databricks/dbrx-instruct``
     - `DBRX Instruct <https://huggingface.co/databricks/dbrx-instruct>`_

   * -
     - ``google/gemma-2-27b``
     - `Gemma 2 27B <https://huggingface.co/google/gemma-2-27b>`_

   * -
     - ``CohereForAI/c4ai-command-r-plus-08-2024``
     - `C4AI Command R+ 08-2024 <https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024>`_

   * -
     - ``deepseek-ai/deepseek-moe-16b-chat``
     - `DeepSeek MoE 16B <https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat>`_

   * - ``$model_repo``
     - ``amd/Llama-3.1-70B-Instruct-FP8-KV``
     - `Llama 3.1 70B FP8 <https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV>`_

   * - (``float8``)
     - ``amd/Llama-3.1-405B-Instruct-FP8-KV``
     - `Llama 3.1 405B FP8 <https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV>`_

   * -
     - ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
     - `Mixtral MoE 8x7B FP8 <https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV>`_

   * -
     - ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
     - `Mixtral MoE 8x22B FP8 <https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV>`_

   * -
     - ``amd/Mistral-7B-v0.1-FP8-KV``
     - `Mistral 7B FP8 <https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV>`_

   * -
     - ``amd/dbrx-instruct-FP8-KV``
     - `DBRX Instruct FP8 <https://huggingface.co/amd/dbrx-instruct-FP8-KV>`_

   * -
     - ``amd/c4ai-command-r-plus-FP8-KV``
     - `C4AI Command R+ 08-2024 FP8 <https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV>`_

   * - ``$num_gpu``
     - 1 or 8
     - Number of GPUs

   * - ``$datatype``
     - ``float16`` or ``float8``
     - Data type

.. _vllm-benchmark-run-benchmark:

Running the benchmark on the MI300X accelerator
-----------------------------------------------

Here are some examples of running the benchmark with various options.
See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
options and their descriptions.

Example 1: latency benchmark
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Use this command to benchmark the latency of the Llama 3.1 70B model on eight GPUs with the ``float16`` and ``float8`` data types.

.. code-block::

   ./vllm_benchmark_report.sh -s latency -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16
   ./vllm_benchmark_report.sh -s latency -m amd/Llama-3.1-70B-Instruct-FP8-KV -g 8 -d float8

Find the latency reports at:

- ``./reports_float16/summary/Llama-3.1-70B-Instruct_latency_report.csv``

- ``./reports_float8/summary/Llama-3.1-70B-Instruct-FP8-KV_latency_report.csv``

Example 2: throughput benchmark
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Use this command to benchmark the throughput of the Llama 3.1 70B model on eight GPUs with the ``float16`` and ``float8`` data types.

.. code-block:: shell

   ./vllm_benchmark_report.sh -s throughput -m meta-llama/Llama-3.1-70B-Instruct -g 8 -d float16
   ./vllm_benchmark_report.sh -s throughput -m amd/Llama-3.1-70B-Instruct-FP8-KV -g 8 -d float8

Find the throughput reports at:

- ``./reports_float16/summary/Llama-3.1-70B-Instruct_throughput_report.csv``

- ``./reports_float8/summary/Llama-3.1-70B-Instruct-FP8-KV_throughput_report.csv``
.. raw:: html
|
||||
|
||||
<style>
|
||||
mjx-container[jax="CHTML"][display="true"] {
|
||||
text-align: left;
|
||||
margin: 0;
|
||||
}
|
||||
</style>
|
||||
|
||||
.. note::
|
||||
|
||||
Throughput is calculated as:
|
||||
|
||||
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
|
||||
|
||||
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
|
||||
|
||||
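
For example (illustrative numbers, not measured results): 100 requests with
input length 128 and output length 128 that complete in 10 seconds give
``throughput_tot = 100 x (128 + 128) / 10 = 2560`` tokens/s and
``throughput_gen = 100 x 128 / 10 = 1280`` tokens/s.
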
Further reading
===============

- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../inference-optimization/workload`.

- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see :doc:`../../system-optimization/mi300x`.

- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <hugging-face-models>`.

- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../inference-optimization/index>`.

- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../fine-tuning/index>`.

Previous versions
=================

See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.
@@ -1,329 +0,0 @@
:orphan:

.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate

**********************************
vLLM inference performance testing
**********************************

.. caution::

This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.

.. _vllm-benchmark-unified-docker:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.7.3_20250325-benchmark-models.yaml

{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}

The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:

* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_

* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_

* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_

* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_

With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
MI300X series accelerators.

.. _vllm-benchmark-available-models:

Available models
================

.. raw:: html

<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>

<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model variant</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>

.. _vllm-benchmark-vllm:

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{model.mad_tag}}

.. note::

See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.

{% endfor %}
{% endfor %}

.. note::

vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.

.. _vllm-benchmark-performance-measurements:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing
popular AI models.

.. important::

The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

Advanced features and known issues
==================================

For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/25070a1841df0dca585b7ddcb967c42aaec4b7c5/docs/dev-docker>`__.

Getting started
===============

Use the following procedures to reproduce the benchmark results on an
MI300X accelerator with the prebuilt vLLM Docker image.

.. _vllm-benchmark-get-started:

1. Disable NUMA auto-balancing.

To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.

.. code-block:: shell

# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
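
If you also want the setting to persist across reboots (an optional step that
is not part of this procedure), the same kernel parameter can be set through
``sysctl``. The configuration file name below is an arbitrary choice:

.. code-block:: shell

# disable automatic NUMA balancing immediately (requires root)
sudo sysctl -w kernel.numa_balancing=0
# persist the setting across reboots
echo 'kernel.numa_balancing = 0' | sudo tee /etc/sysctl.d/99-numa-balancing.conf
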
2. Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.

Use the following command to pull the Docker image from Docker Hub.

.. code-block:: shell

docker pull {{ unified_docker.pull_tag }}

Benchmarking
============

Once the setup is complete, choose between two options to reproduce the
benchmark results:

.. _vllm-benchmark-mad:

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{model.mad_tag}}

.. tab-set::

.. tab-item:: MAD-integrated benchmarking

Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.

.. code-block:: shell

git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt

Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the ``{{model.precision}}`` data type on the host machine.

.. code-block:: shell

export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800

MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.

Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.

.. tab-item:: Standalone benchmarking

Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.

.. code-block::

docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}

In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.

.. code-block::

git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm

To start the benchmark, use the following command with the appropriate options.

.. code-block::

./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}

.. list-table::
:header-rows: 1
:align: center

* - Name
- Options
- Description

* - ``$test_option``
- latency
- Measure decoding token latency

* -
- throughput
- Measure token generation throughput

* -
- all
- Measure both throughput and latency

* - ``$num_gpu``
- 1 or 8
- Number of GPUs

* - ``$datatype``
- ``float16`` or ``float8``
- Data type

.. note::

The input sequence length, output sequence length, and tensor parallelism (TP) size are
already configured. You don't need to specify them with this script.

.. note::

If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.

.. code-block::

OSError: You are trying to access a gated repo.

# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
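
Alternatively (a standard Hugging Face tooling option, not part of the
original script), you can authenticate once with the Hugging Face CLI, which
caches the token for subsequent runs:

.. code-block:: shell

# interactive login; paste your access-authorized token when prompted
huggingface-cli login
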
Here are some examples of running the benchmark with various options.

* Latency benchmark

Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.

.. code-block::

./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}

Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.

* Throughput benchmark

Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with the ``{{model.precision}}`` data type.

.. code-block:: shell

./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}

Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.

.. raw:: html

<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>

.. note::

Throughput is calculated as:

- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time

- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
{% endfor %}
{% endfor %}

Further reading
===============

- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../inference-optimization/workload`.

- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <hugging-face-models>`.

- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../inference-optimization/index>`.

- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../fine-tuning/index>`.

Previous versions
=================

See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.
@@ -1,343 +0,0 @@
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate

**********************************
vLLM inference performance testing
**********************************

.. _vllm-benchmark-unified-docker:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.3_20250415-benchmark-models.yaml

{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}

The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:

* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_

* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_

* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/pytorch/pytorch>`_

* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_

With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
MI300X series accelerators.

.. _vllm-benchmark-available-models:

Supported models
================

.. raw:: html

<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>

<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model variant</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>

.. _vllm-benchmark-vllm:

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{model.mad_tag}}

.. note::

See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.

{% endfor %}
{% endfor %}

.. note::

vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.

.. _vllm-benchmark-performance-measurements:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing
popular AI models.

.. important::

The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

Advanced features and known issues
==================================

For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/7a9f58aae0e7215a5f3dccde60e35072c41656c2/docs/dev-docker>`__.

System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.

.. code-block:: shell

# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

Pull the Docker image
=====================

Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.

.. code-block:: shell

docker pull {{ unified_docker.pull_tag }}

Benchmarking
============

Once the setup is complete, choose between two options to reproduce the
benchmark results:

.. _vllm-benchmark-mad:

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{model.mad_tag}}

.. tab-set::

.. tab-item:: MAD-integrated benchmarking

Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.

.. code-block:: shell

git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt

Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the ``{{model.precision}}`` data type on the host machine.

.. code-block:: shell

export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800

MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.

Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.

{% if model.tunableop %}

.. note::

For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
TunableOp automatically explores different implementations and configurations of certain PyTorch
operators to find the fastest one for your hardware.

By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on``.

Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
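
As a sketch, the edit can also be scripted. The one-liner below assumes the
default ``--tunableop off`` string appears verbatim in ``models.json``; back
up the file first and verify the result:

.. code-block:: shell

# flip the TunableOp flag for all models that carry it (illustrative only)
sed -i 's/--tunableop off/--tunableop on/' models.json
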
{% endif %}

.. tab-item:: Standalone benchmarking

Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.

.. code-block::

docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}

In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.

.. code-block::

git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm

To start the benchmark, use the following command with the appropriate options.

.. code-block::

./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}

.. list-table::
:header-rows: 1
:align: center

* - Name
- Options
- Description

* - ``$test_option``
- latency
- Measure decoding token latency

* -
- throughput
- Measure token generation throughput

* -
- all
- Measure both throughput and latency

* - ``$num_gpu``
- 1 or 8
- Number of GPUs

* - ``$datatype``
- ``float16`` or ``float8``
- Data type

.. note::

The input sequence length, output sequence length, and tensor parallelism (TP) size are
already configured. You don't need to specify them with this script.

.. note::

If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.

.. code-block::

OSError: You are trying to access a gated repo.

# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token

Here are some examples of running the benchmark with various options.

* Latency benchmark

Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.

.. code-block::

./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}

Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.

* Throughput benchmark

Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.

.. code-block:: shell

./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}

Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.

.. raw:: html

<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>

.. note::

Throughput is calculated as:

- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time

- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
{% endfor %}
{% endfor %}

Further reading
===============

- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../inference-optimization/workload`.

- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <hugging-face-models>`.

- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../inference-optimization/index>`.

- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../fine-tuning/index>`.

Previous versions
=================

See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.
@@ -12,7 +12,7 @@ vLLM inference performance testing
.. caution::

This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.
performance benchmark documentation. See :doc:`../vllm` for the latest version.

.. _vllm-benchmark-unified-docker:

@@ -109,18 +109,18 @@ vLLM inference performance testing
page provides reference throughput and latency measurements for inferencing
popular AI models.

.. important::
.. note::

The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the :doc:`latest version of this inference benchmarking environment <../vllm>`_.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.

Advanced features and known issues
==================================

For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/16d2b92ebcf90fe55cf73fa0b9329a6c9d3dede8/docs/dev-docker>`__.
see the developer's guide at `<https://github.com/ROCm/vllm/blob/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker/README.md>`__.

System validation
=================
@@ -346,9 +346,3 @@ Further reading

- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../../../fine-tuning/index>`.

Previous versions
=================

See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

@@ -1,354 +0,0 @@
:orphan:

.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate

**********************************
vLLM inference performance testing
**********************************

.. caution::

This documentation does not reflect the latest version of ROCm vLLM
inference performance documentation. See :doc:`../vllm` for the latest version.

.. _vllm-benchmark-unified-docker:

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.8.5_20250521-benchmark-models.yaml

{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}

The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:

* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_

* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_

* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_

* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_

With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
MI300X series accelerators.

.. _vllm-benchmark-available-models:

Supported models
================

The following models are supported for inference performance benchmarking
with vLLM and ROCm. Some instructions, commands, and recommendations in this
documentation might vary by model -- select one to get started.

.. raw:: html

<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>

<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>

.. _vllm-benchmark-vllm:

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{model.mad_tag}}

.. note::

See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.

{% endfor %}
{% endfor %}

.. note::

vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.

.. _vllm-benchmark-performance-measurements:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing
popular AI models.

.. note::

The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.

Advanced features and known issues
==================================

For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.

System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.

.. code-block:: shell

# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

Pull the Docker image
=====================

Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.

.. code-block:: shell

docker pull {{ unified_docker.pull_tag }}

Benchmarking
============

Once the setup is complete, choose between two options to reproduce the
benchmark results:

.. _vllm-benchmark-mad:

{% for model_group in model_groups %}
{% for model in model_group.models %}

.. container:: model-doc {{model.mad_tag}}

.. tab-set::

.. tab-item:: MAD-integrated benchmarking

Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.

.. code-block:: shell

git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt

Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the ``{{model.precision}}`` data type on the host machine.

.. code-block:: shell

export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800

MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.

Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.

{% if model.tunableop %}

.. note::

For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
TunableOp automatically explores different implementations and configurations of certain PyTorch
operators to find the fastest one for your hardware.

By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on``.

Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.

{% endif %}

.. tab-item:: Standalone benchmarking

Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.

.. code-block::

docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}

In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.

.. code-block::

git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm

To start the benchmark, use the following command with the appropriate options.

.. code-block::

./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}

.. list-table::
:header-rows: 1
:align: center

* - Name
- Options
- Description

* - ``$test_option``
- latency
- Measure decoding token latency

* -
- throughput
- Measure token generation throughput

* -
- all
- Measure both throughput and latency

* - ``$num_gpu``
- 1 or 8
- Number of GPUs

* - ``$datatype``
- ``float16`` or ``float8``
- Data type

.. note::

The input sequence length, output sequence length, and tensor parallelism (TP) size are
already configured. You don't need to specify them with this script.

.. note::

If you encounter the following error, pass your access-authorized Hugging
Face token to the gated models.

.. code-block::

OSError: You are trying to access a gated repo.

# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token

Here are some examples of running the benchmark with various options.

* Latency benchmark

Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.

.. code-block::

./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}

Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.

* Throughput benchmark

Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.

.. code-block:: shell

./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}

Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.

.. raw:: html

<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>

.. note::

Throughput is calculated as:

- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time

- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
{% endfor %}
{% endfor %}

Further reading
===============

- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.

- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../../inference-optimization/workload`.

- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <../hugging-face-models>`.

- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../../inference-optimization/index>`.

- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../../fine-tuning/index>`.

Previous versions
=================

See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.
@@ -1,75 +0,0 @@
:orphan:

**************************************************
vLLM inference performance testing version history
**************************************************

This table lists previous versions of the ROCm vLLM inference Docker image for
inference performance testing. For detailed information about available models
for benchmarking, see the version-specific documentation. You can find tagged
previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`_.

.. list-table::
:header-rows: 1
:stub-columns: 1

* - ROCm version
- vLLM version
- PyTorch version
- Resources

* - 6.4.0
- 0.9.0.1
- 2.7.0
-
* :doc:`Documentation <../vllm>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_

* - 6.3.1
- 0.8.5 (0.8.6.dev)
- 2.7.0
-
* :doc:`Documentation <vllm-0.8.5-20250521>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_

* - 6.3.1
- 0.8.5
- 2.7.0
-
* :doc:`Documentation <vllm-0.8.5-20250513>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`_

* - 6.3.1
- 0.8.3
- 2.7.0
-
* :doc:`Documentation <vllm-0.8.3-20250415>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`_

* - 6.3.1
- 0.7.3
- 2.7.0
-
* :doc:`Documentation <vllm-0.7.3-20250325>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`_

* - 6.3.1
- 0.6.6
- 2.7.0
-
* :doc:`Documentation <vllm-0.6.6>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_

* - 6.2.1
- 0.6.4
- 2.5.0
-
* :doc:`Documentation <vllm-0.6.4>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_

* - 6.2.0
- 0.4.3
- 2.4.0
-
* :doc:`Documentation <vllm-0.4.3>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`_
@@ -99,20 +99,21 @@ vLLM inference performance testing

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing popular AI models.
page provides reference throughput and latency measurements for inferencing
popular AI models.

.. important::
.. note::

The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.

Advanced features and known issues
==================================

For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker>`__.
see the developer's guide at `<https://github.com/ROCm/vllm/blob/main/docs/dev-docker/README.md>`__.

System validation
=================
@@ -342,5 +343,57 @@ Further reading
Previous versions
=================

See :doc:`previous-versions/vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.
This table lists previous versions of the ROCm vLLM inference Docker image for
inference performance testing. For detailed information about available models
for benchmarking, see the version-specific documentation.

.. list-table::
:header-rows: 1
:stub-columns: 1

* - ROCm version
- vLLM version
- PyTorch version
- Resources

* - 6.3.1
- 0.8.5
- 2.7.0
-
* :doc:`Documentation <previous-versions/vllm-0.8.5-20250513>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`_

* - 6.3.1
- 0.8.3
- 2.7.0
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.4.0/how-to/rocm-for-ai/inference/vllm-benchmark.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`_

* - 6.3.1
- 0.7.3
- 2.7.0
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.3/how-to/rocm-for-ai/inference/vllm-benchmark.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`_

* - 6.3.1
- 0.6.6
- 2.7.0
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.2/how-to/rocm-for-ai/inference/vllm-benchmark.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_

* - 6.2.1
- 0.6.4
- 2.5.0
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_

* - 6.2.0
- 0.4.3
- 2.4.0
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.2.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`_

@@ -1,5 +1,5 @@
|
||||
.. meta::
|
||||
:description: How to install the ROCm 7.0 preview
|
||||
:description: How to install ROCm and popular machine learning frameworks.
|
||||
:keywords: ROCm, AI, LLM, train, fine-tune, FSDP, DeepSpeed, LLaMA, tutorial
|
||||
|
||||
.. _rocm-for-ai-install:
|
||||
|
||||
@@ -374,5 +374,22 @@ own cluster setup.
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`previous-versions/jax-maxtext-history` to find documentation for previous releases
|
||||
of the ``ROCm/jax-training`` Docker image.
|
||||
This table lists previous versions of the ROCm JAX MaxText Docker image for training
|
||||
performance testing. For detailed information about available models for
|
||||
benchmarking, see the version-specific documentation.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
|
||||
* - Image version
|
||||
- ROCm version
|
||||
- JAX version
|
||||
- Resources
|
||||
|
||||
* - 25.4
|
||||
- 6.3.0
|
||||
- 0.4.31
|
||||
-
|
||||
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.3/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.html>`_
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.4/images/sha256-fb3eb71cd74298a7b3044b7130cf84113f14d518ff05a2cd625c11ea5f6a7b01>`_
|
||||
|
||||
@@ -119,12 +119,12 @@ To evaluate performance, the
|
||||
page provides reference throughput and latency measurements for training
|
||||
popular AI models.
|
||||
|
||||
.. important::
|
||||
.. note::
|
||||
|
||||
The performance data presented in
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
|
||||
only reflects the latest version of this training benchmarking environment.
|
||||
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
|
||||
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
|
||||
should not be interpreted as the peak performance achievable by AMD
|
||||
Instinct MI325X and MI300X accelerators or ROCm software.
|
||||
|
||||
System validation
|
||||
=================
|
||||
@@ -749,5 +749,36 @@ The benchmark tests support the following sets of variables.
|
||||
Previous versions
|
||||
=================
|
||||
|
||||
See :doc:`previous-versions/megatron-lm-history` to find documentation for previous releases
|
||||
of the ``ROCm/megatron-lm`` Docker image.
|
||||
This table lists previous versions of the ROCm Megatron-LM Docker image for training
|
||||
performance testing. For detailed information about available models for
|
||||
benchmarking, see the version-specific documentation.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
|
||||
* - Image version
|
||||
- ROCm version
|
||||
- PyTorch version
|
||||
- Resources
|
||||
|
||||
* - 25.4
|
||||
- 6.3.0
|
||||
- 2.7.0a0+git637433
|
||||
-
|
||||
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.3/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.html>`_
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.4/images/sha256-941aa5387918ea91c376c13083aa1e6c9cab40bb1875abbbb73bbb65d8736b3f>`_
|
||||
|
||||
* - 25.3
|
||||
- 6.3.0
|
||||
- 2.7.0a0+git637433
|
||||
-
|
||||
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.2/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.html>`_
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.3/images/sha256-1e6ed9bdc3f4ca397300d5a9907e084ab5e8ad1519815ee1f868faf2af1e04e2>`_
|
||||
|
||||
* - 24.12-dev
|
||||
- 6.1.0
|
||||
- 2.4.0
|
||||
-
|
||||
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.0/how-to/rocm-for-ai/train-a-model.html>`_
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/24.12-dev/images/sha256-5818c50334ce3d69deeeb8f589d83ec29003817da34158ebc9e2d112b929bf2e>`_
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
:orphan:
|
||||
|
||||
********************************************************
|
||||
JAX MaxText training performance testing version history
|
||||
********************************************************
|
||||
|
||||
This table lists previous versions of the ROCm JAX MaxText Docker image for training
|
||||
performance testing. For detailed information about available models for
|
||||
benchmarking, see the version-specific documentation.
|
||||
You can find tagged
|
||||
previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/jax-training/tags>`_.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:stub-columns: 1
|
||||
|
||||
* - Image version
|
||||
- ROCm version
|
||||
- JAX version
|
||||
- Resources
|
||||
|
||||
* - 25.5
|
||||
- 6.3.4
|
||||
- 0.4.35
|
||||
-
|
||||
* :doc:`Documentation <../jax-maxtext>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`_
|
||||
|
||||
* - 25.4
|
||||
- 6.3.0
|
||||
- 0.4.31
|
||||
-
|
||||
* :doc:`Documentation <jax-maxtext-v25.4>`
|
||||
* `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.4/images/sha256-fb3eb71cd74298a7b3044b7130cf84113f14d518ff05a2cd625c11ea5f6a7b01>`_
|
||||
@@ -1,358 +0,0 @@
:orphan:

.. meta::
   :description: How to train a model using JAX MaxText for ROCm.
   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker

**************************************
Training a model with MaxText for ROCm
**************************************

.. caution::

   This documentation does not reflect the latest version of ROCm JAX MaxText
   training performance documentation. See :doc:`../jax-maxtext` for the latest version.

MaxText is a high-performance, open-source framework built on the Google JAX
machine learning library to train LLMs at scale. The MaxText framework for
ROCm is an optimized fork of the upstream
`<https://github.com/AI-Hypercomputer/maxtext>`__, enabling efficient AI workloads
on AMD Instinct MI300X series accelerators.

The MaxText for ROCm training Docker image (``rocm/jax-training:maxtext-v25.4``)
provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
It includes the following software components:

+--------------------------+--------------------------------+
| Software component       | Version                        |
+==========================+================================+
| ROCm                     | 6.3.0                          |
+--------------------------+--------------------------------+
| JAX                      | 0.4.31                         |
+--------------------------+--------------------------------+
| Python                   | 3.10                           |
+--------------------------+--------------------------------+
| Transformer Engine       | 1.12.0.dev0+f81a3eb            |
+--------------------------+--------------------------------+
| hipBLASLt                | git78ec8622                    |
+--------------------------+--------------------------------+

Supported features and models
=============================

MaxText provides the following key features to train large language models efficiently:

- Transformer Engine (TE)

- Flash Attention (FA) 3

- GEMM tuning

- Multi-node support

.. _amd-maxtext-model-support:

The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

* Llama 3.1 8B

* Llama 3.1 70B

* Llama 3 8B

* Llama 3 70B

* Llama 2 7B

* Llama 2 70B

* DeepSeek-V2-Lite

.. note::

   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).

Unsupported features
--------------------

Currently, MaxText's default packed input format is not supported. Using this format
with the current Docker image results in incorrect attention calculations
across different input sequences. Support for packed input format is planned for a future release.
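
If you drive MaxText directly rather than through the provided benchmark
scripts, the workaround is to turn packing off in your run configuration. This
is a minimal sketch, assuming your checkout exposes the upstream ``packing``
config key; verify the exact key name in your fork's ``configs/base.yml``
before relying on it.

.. code-block:: shell

   # Hypothetical override: disable input packing for a direct MaxText run.
   python3 MaxText/train.py MaxText/configs/base.yml packing=false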

System validation
=================

If you have already validated your system settings, including NUMA
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
and optimization steps <train-a-model-system-validation>` to set up your system
before starting training.

Environment setup
=================

This Docker image is optimized for specific model configurations outlined
as follows. Performance can vary for other training workloads, as AMD
doesn’t validate configurations and run conditions outside those described.

.. _amd-maxtext-multi-node-setup:

Multi-node setup
----------------

For multi-node environments, ensure you have all the necessary packages for
your network device, such as RDMA. If you're not using a multi-node setup
with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.

1. Install the following packages to build and install the RDMA driver.

   .. code-block:: shell

      sudo apt install iproute2 -y
      sudo apt install -y linux-headers-"$(uname -r)" libelf-dev
      sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev

   Refer to your NIC manufacturer's documentation for further steps on
   compiling and installing the RoCE driver. For example, for Broadcom,
   see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
   in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.
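
   After the driver is installed, you can optionally confirm that the RDMA
   devices are visible. ``ibv_devices`` and ``ibv_devinfo`` come from the
   ``ibverbs-utils`` package installed above; the device name below is an
   example and varies by NIC.

   .. code-block:: shell

      ibv_devices          # list all RDMA devices the kernel sees
      ibv_devinfo rdma0    # port state and capabilities for one device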

2. Set the following environment variables.

   a. Master address

      Change ``localhost`` to the master node's resolvable hostname or IP address:

      .. code-block:: bash

         export MASTER_ADDR="${MASTER_ADDR:-localhost}"

   b. Number of nodes

      Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):

      .. code-block:: bash

         export NNODES="${NNODES:-1}"

   c. Node ranks

      Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
      Node ranks should be unique across all nodes in the cluster.

      .. code-block:: bash

         export NODE_RANK="${NODE_RANK:-0}"

   d. Network interface

      Update the network interface in the script to match your system's network interface. To
      find your network interface, run the following (outside of any Docker container):

      .. code-block:: bash

         ip a

      Look for an active interface with an IP address in the same subnet as
      your other nodes. Then, update the following variable in the script, for
      example:

      .. code-block:: bash

         export NCCL_SOCKET_IFNAME=ens50f0np0

      This variable specifies which network interface to use for inter-node communication.
      Setting this variable to the incorrect interface can result in communication failures
      or significantly reduced performance.

   e. RDMA interface

      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
      Then, set the RDMA interfaces to use for communication.

      .. code-block:: bash

         # If using Broadcom NIC
         export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
         # If using Mellanox NIC
         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
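
   For reference, here is a minimal sketch that consolidates the variables above
   for a hypothetical two-node run. The hostname, interface, and HCA names are
   placeholders; substitute the values you discovered on your own cluster.

   .. code-block:: bash

      # Master node settings (run with NODE_RANK=1 on the worker instead).
      export MASTER_ADDR=node0.example.com          # placeholder hostname
      export NNODES=2
      export NODE_RANK=0
      export NCCL_SOCKET_IFNAME=ens50f0np0          # active interface from `ip a`
      export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7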

.. _amd-maxtext-download-docker:

Download the Docker image
-------------------------

1. Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

      docker pull rocm/jax-training:maxtext-v25.4

2. Run the Docker container.

   .. code-block:: shell

      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.4
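
   Once inside the container, you can optionally confirm that the accelerators
   are visible before launching any benchmark:

   .. code-block:: shell

      rocm-smi   # should list every MI300X or MI325X GPU you expect to train on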

.. _amd-maxtext-get-started:

Getting started
===============

The following examples demonstrate how to get started with single node
and multi-node training using the benchmarking scripts provided at
`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.

.. important::

   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.

Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
set correctly and points to your Hugging Face cache directory. Refer to the
README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
for more detailed instructions.
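
For example (the path below is a placeholder; point it at your own cache directory):

.. code-block:: shell

   export HF_HOME=/data/huggingface   # placeholder path to your Hugging Face cache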

Single node training benchmarking examples
------------------------------------------

* Example 1: Single node training with Llama 2 7B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama2_7b.sh

* Example 2: Single node training with Llama 2 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama2_70b.sh

* Example 3: Single node training with Llama 3 8B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama3_8b.sh

* Example 4: Single node training with Llama 3 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./llama3_70b.sh

* Example 5: Single node training with DeepSeek V2 16B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh

  Run the single node training benchmark:

  .. code-block:: shell

     IMAGE="rocm/jax-training:maxtext-v25.4" bash ./deepseek_v2_16b.sh

.. note::

   The TFLOP/s that MaxText reports for DeepSeek is not accurate. Use
   tokens/s as the performance indicator instead.

Multi-node training benchmarking examples
-----------------------------------------

The following examples use SLURM to run on multiple nodes. You might need to
adjust the commands for your own cluster setup.

* Example 1: Multi-node training with Llama 2 7B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama2_7b_multinode.sh

* Example 2: Multi-node training with Llama 2 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama2_70b_multinode.sh

* Example 3: Multi-node training with Llama 3 8B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama3_8b_multinode.sh

* Example 4: Multi-node training with Llama 3 70B

  Download the benchmarking script:

  .. code-block:: shell

     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh

  Run the multi-node training benchmark. For example:

  .. code-block:: shell

     sbatch -N <num_nodes> llama3_70b_multinode.sh

Previous versions
=================

See :doc:`jax-maxtext-history` to find documentation for previous releases
of the ``ROCm/jax-training`` Docker image.
@@ -1,47 +0,0 @@
:orphan:

********************************************************
Megatron-LM training performance testing version history
********************************************************

This table lists previous versions of the ROCm Megatron-LM Docker image for
training performance testing. For detailed information about available models
for benchmarking, see the version-specific documentation. You can find tagged
previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/megatron-lm/tags>`_.

.. list-table::
   :header-rows: 1
   :stub-columns: 1

   * - Image version
     - ROCm version
     - PyTorch version
     - Resources

   * - v25.5
     - 6.3.4
     - 2.8.0a0+gite2f9759
     -
       * :doc:`Documentation <../megatron-lm>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.5_py312/images/sha256-4506f18ba188d24189c6b1f95130b425f52c528a543bb3f420351824edceadc2>`_

   * - v25.4
     - 6.3.0
     - 2.7.0a0+git637433
     -
       * :doc:`Documentation <megatron-lm-v25.4>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.4/images/sha256-941aa5387918ea91c376c13083aa1e6c9cab40bb1875abbbb73bbb65d8736b3f>`_

   * - v25.3
     - 6.3.0
     - 2.7.0a0+git637433
     -
       * :doc:`Documentation <megatron-lm-v25.3>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.3/images/sha256-1e6ed9bdc3f4ca397300d5a9907e084ab5e8ad1519815ee1f868faf2af1e04e2>`_

   * - v24.12-dev
     - 6.1.0
     - 2.4.0
     -
       * :doc:`Documentation <megatron-lm-v24.12-dev>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/24.12-dev/images/sha256-5818c50334ce3d69deeeb8f589d83ec29003817da34158ebc9e2d112b929bf2e>`_
@@ -1,515 +0,0 @@
:orphan:

.. meta::
   :description: How to train a model using ROCm Megatron-LM
   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch

**************************************
Training a model with ROCm Megatron-LM
**************************************

.. caution::

   This documentation does not reflect the latest version of ROCm Megatron-LM
   training performance documentation. See :doc:`../megatron-lm` for the latest version.

.. _amd-megatron-lm:

The ROCm Megatron-LM framework is a specialized fork of Megatron-LM designed to
enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.

For ease of use, AMD provides a ready-to-use Docker image for MI300X accelerators containing essential
components, including PyTorch, PyTorch Lightning, ROCm libraries, and Megatron-LM utilities. It contains the
following software to accelerate training workloads:

+--------------------------+--------------------------------+
| Software component       | Version                        |
+==========================+================================+
| ROCm                     | 6.1                            |
+--------------------------+--------------------------------+
| PyTorch                  | 2.4.0                          |
+--------------------------+--------------------------------+
| PyTorch Lightning        | 2.4.0                          |
+--------------------------+--------------------------------+
| Megatron Core            | 0.9.0                          |
+--------------------------+--------------------------------+
| Transformer Engine       | 1.5.0                          |
+--------------------------+--------------------------------+
| Flash Attention          | v2.6                           |
+--------------------------+--------------------------------+
| Transformers             | 4.44.0                         |
+--------------------------+--------------------------------+

Supported features and models
=============================

Megatron-LM provides the following key features to train large language models efficiently:

- Transformer Engine (TE)

- APEX

- GEMM tuning

- Torch.compile

- 3D parallelism: TP + SP + CP

- Distributed optimizer

- Flash Attention (FA) 2

- Fused kernels

- Pre-training

.. _amd-megatron-lm-model-support:

The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

* Llama 2 7B

* Llama 2 70B

* Llama 3 8B

* Llama 3 70B

* Llama 3.1 8B

* Llama 3.1 70B

Prerequisite system validation steps
====================================

Complete the following system validation and optimization steps to set up your system before starting training.

Disable NUMA auto-balancing
---------------------------

Generally, application performance can benefit from disabling NUMA auto-balancing. However,
it might be detrimental to performance with certain types of workloads.

Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
Memory Access) settings. An output of ``0`` indicates this setting is disabled. If there is no output or
the output is ``1``, run the following command to disable NUMA auto-balancing.

.. code-block:: shell

   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'

See :ref:`mi300x-disable-numa` for more information.

Hardware verification with ROCm
-------------------------------

Use the command ``rocm-smi --setperfdeterminism 1900`` to set the max clock speed up to 1900 MHz
instead of the default 2100 MHz. This can reduce the chance of a PCC event lowering the attainable
GPU clocks. This setting will not be required for new IFWI releases with the production PRC feature.
You can restore this setting to its default value with the ``rocm-smi -r`` command.

Run the command:

.. code-block:: shell

   rocm-smi --setperfdeterminism 1900

See :ref:`mi300x-hardware-verification-with-rocm` for more information.

RCCL Bandwidth Test
-------------------

ROCm Collective Communications Library (RCCL) is a standalone library of standard collective communication
routines for GPUs. See the :doc:`RCCL documentation <rccl:index>` for more information. Before starting
pre-training, running an RCCL bandwidth test helps ensure that the multi-GPU or multi-node setup is optimized
for efficient distributed training.

Running the RCCL bandwidth test helps verify that:

- The GPUs can communicate across nodes or within a single node.

- The interconnect (such as InfiniBand, Ethernet, or Infinity Fabric) is functioning as expected and
  provides adequate bandwidth for communication.

- There are no hardware setup or cabling issues that could affect communication between GPUs.

Tuning and optimizing hyperparameters
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In distributed training, specific hyperparameters related to distributed communication can be tuned based on
the results of the RCCL bandwidth test. These variables are already set in the Docker image:

.. code-block:: shell

   # force all RCCL streams to be high priority
   export TORCH_NCCL_HIGH_PRIORITY=1

   # specify which RDMA interfaces to use for communication
   export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7

   # define the Global ID index used in RoCE mode
   export NCCL_IB_GID_INDEX=3

   # avoid data corruption/mismatch issue that existed in past releases
   export RCCL_MSCCL_ENABLE=0

Running the RCCL Bandwidth Test
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

It's recommended that you run the RCCL bandwidth test before launching training to confirm that
system performance is sufficient. The RCCL tests are not included in the AMD Megatron-LM Docker
image; follow the instructions in `<https://github.com/ROCm/rccl-tests>`__ to get started.
See :ref:`mi300x-rccl` for more information.
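
A minimal build sketch, assuming ROCm is installed in the default ``/opt/rocm``
location (see the rccl-tests README for MPI-enabled builds and other options):

.. code-block:: shell

   git clone https://github.com/ROCm/rccl-tests.git
   cd rccl-tests
   make   # produces the perf binaries, for example ./build/all_reduce_perf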

Run on 8 GPUs (``-g 8``), scanning from 8 bytes to 10 GB:

.. code-block:: shell

   ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8

.. image:: ../../data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
   :width: 800

Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node is
recommended. So, a run on 8 GPUs looks something like:

.. code-block:: shell

   mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1

.. image:: ../../data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
   :width: 800

Running with one MPI process per GPU ensures a one-to-one mapping for CPUs and GPUs, which can be beneficial
for smaller message sizes. This better represents the real-world use of RCCL in deep learning frameworks like
PyTorch and TensorFlow.

Use the following script to run the RCCL test for four MI300X GPU nodes. Modify paths and node addresses as needed.

.. code-block:: shell

   /home/$USER/ompi_for_gpu/ompi/bin/mpirun -np 32 -H tw022:8,tw024:8,tw010:8,tw015:8 \
     --mca pml ucx \
     --mca btl ^openib \
     -x NCCL_SOCKET_IFNAME=ens50f0np0 \
     -x NCCL_IB_HCA=rdma0:1,rdma1:1,rdma2:1,rdma3:1,rdma4:1,rdma5:1,rdma6:1,rdma7:1 \
     -x NCCL_IB_GID_INDEX=3 \
     -x NCCL_MIN_NCHANNELS=40 \
     -x NCCL_DEBUG=version \
     $HOME/rccl-tests/build/all_reduce_perf -b 8 -e 8g -f 2 -g 1

.. image:: ../../data/how-to/rocm-for-ai/rccl-tests-4-mi300x-gpu-nodes.png
   :width: 800

.. _mi300x-amd-megatron-lm-training:

Start training on MI300X accelerators
=====================================

The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
training benchmarks, and achieve superior performance for models like Llama 2 and Llama 3.1.

Use the following instructions to set up the environment, configure the script to train models, and
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
image.

.. _amd-megatron-lm-requirements:

Download the Docker image and required packages
-----------------------------------------------

1. Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

      docker pull rocm/megatron-lm:24.12-dev

2. Launch the Docker container.

   .. code-block:: shell

      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $CACHE_DIR:/root/.cache --name megatron-dev-env rocm/megatron-lm:24.12-dev /bin/bash

3. Clone the ROCm Megatron-LM repository to a local directory and install the required packages on the host machine.

   .. code-block:: shell

      git clone https://github.com/ROCm/Megatron-LM
      cd Megatron-LM

   .. note::

      This release is validated with ``ROCm/Megatron-LM`` commit `bb93ccb <https://github.com/ROCm/Megatron-LM/tree/bb93ccbfeae6363c67b361a97a27c74ab86e7e92>`_.
      Checking out this specific commit is recommended for a stable and reproducible environment.

      .. code-block:: shell

         git checkout bb93ccbfeae6363c67b361a97a27c74ab86e7e92

Prepare training datasets
-------------------------

If you already have the preprocessed data, you can skip this section.

Use the following command to preprocess datasets. GPT data is used as an example. You can change
the merge table, append an end-of-document token, remove sentence splitting, and set the tokenizer type.

.. code-block:: shell

   python tools/preprocess_data.py \
       --input my-corpus.json \
       --output-prefix my-gpt2 \
       --vocab-file gpt2-vocab.json \
       --tokenizer-type GPT2BPETokenizer \
       --merge-file gpt2-merges.txt \
       --append-eod

In this case, the automatically generated output files are named ``my-gpt2_text_document.bin`` and
``my-gpt2_text_document.idx``.

.. image:: ../../data/how-to/rocm-for-ai/prep-training-datasets-my-gpt2-text-document.png
   :width: 800
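
When you configure training, point the data path at the common prefix of these
two files; Megatron-LM appends the ``.bin`` and ``.idx`` extensions itself. A
sketch, assuming the output above (the directory is a placeholder):

.. code-block:: shell

   DATA_PATH=/path/to/my-gpt2_text_document   # prefix only, no extension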

.. _amd-megatron-lm-environment-setup:

Environment setup
-----------------

In the ``examples/llama`` directory of Megatron-LM, if you're working with Llama 2 7B or Llama 2 70B, use the
``train_llama2.sh`` configuration script. Likewise, if you're working with Llama 3 or Llama 3.1, then use
``train_llama3.sh`` and update the configuration script accordingly.

Network interface
^^^^^^^^^^^^^^^^^

To avoid connectivity issues, ensure the correct network interface is set in your training scripts.

1. Run the following command to find the active network interface on your system.

   .. code-block:: shell

      ip a

2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system’s network interface. For
   example:

   .. code-block:: shell

      export NCCL_SOCKET_IFNAME=ens50f0np0

      export GLOO_SOCKET_IFNAME=ens50f0np0

Dataset options
^^^^^^^^^^^^^^^

You can use either mock data or real data for training.

* If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.

  .. code-block:: shell

     DATA_DIR="/root/.cache/data" # Change to where your dataset is stored

     DATA_PATH=${DATA_DIR}/bookcorpus_text_sentence

  .. code-block:: shell

     --data-path $DATA_PATH

  Ensure that the files are accessible inside the Docker container (see the
  volume-mount sketch after this list).

* Mock data can be useful for testing and validation. If you're using mock data, replace ``--data-path $DATA_PATH`` with the ``--mock-data`` option.

  .. code-block:: shell

     --mock-data
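
To make a host dataset visible inside the container, bind-mount its directory
when you launch Docker. This is a minimal sketch, assuming the launch command
from :ref:`amd-megatron-lm-requirements` and the ``DATA_DIR`` used above; the
host path is a placeholder.

.. code-block:: shell

   docker run -it --network host --ipc host --device /dev/dri --device /dev/kfd \
       -v /host/datasets/bookcorpus:/root/.cache/data \
       --name megatron-dev-env rocm/megatron-lm:24.12-dev /bin/bash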

Tokenizer
^^^^^^^^^

Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
handle a variety of input sequences, including unseen words or domain-specific terms.

To train any of the Llama 2 models that this Docker image supports, use the ``Llama2Tokenizer``.

To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.

For example, if you're using the Llama 3.1 8B model:

.. code-block:: shell

   TOKENIZER_MODEL=meta-llama/Llama-3.1-8B
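
Gated models such as Llama require Hugging Face authentication. If the
tokenizer download fails with an authorization error, log in first (this
assumes the ``huggingface-cli`` tool is available in the container):

.. code-block:: shell

   huggingface-cli login   # paste a token that has access to the gated model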

Run benchmark tests
-------------------

.. note::

   If you're running **multi-node training**, update the following environment variables. They can
   also be passed as command line arguments.

   * Change ``localhost`` to the master node's hostname:

     .. code-block:: shell

        MASTER_ADDR="${MASTER_ADDR:-localhost}"

   * Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):

     .. code-block:: shell

        NNODES="${NNODES:-1}"

   * Set the rank of each node (0 for master, 1 for the first worker node, and so on):

     .. code-block:: shell

        NODE_RANK="${NODE_RANK:-0}"

* Use this command to run a performance benchmark test of any of the Llama 2 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).

  .. code-block:: shell

     {variables} bash examples/llama/train_llama2.sh

* Use this command to run a performance benchmark test of any of the Llama 3 and Llama 3.1 models that this Docker image supports (see :ref:`variables <amd-megatron-lm-benchmark-test-vars>`).

  .. code-block:: shell

     {variables} bash examples/llama/train_llama3.sh

.. _amd-megatron-lm-benchmark-test-vars:

The benchmark tests support the following set of variables:

+--------------------------+-----------------------+-----------------------+
| Name                     | Options               | Description           |
+==========================+=======================+=======================+
| ``TEE_OUTPUT``           | 0 or 1                | 0: disable training   |
|                          |                       | log                   |
|                          |                       |                       |
|                          |                       | 1: enable training    |
|                          |                       | log                   |
+--------------------------+-----------------------+-----------------------+
| ``MBS``                  |                       | Micro batch size      |
+--------------------------+-----------------------+-----------------------+
| ``BS``                   |                       | Batch size            |
+--------------------------+-----------------------+-----------------------+
| ``TP``                   | 1, 2, 4, 8            | Tensor parallel       |
+--------------------------+-----------------------+-----------------------+
| ``TE_FP8``               | 0 or 1                | Datatype.             |
|                          |                       |                       |
|                          |                       | 1: FP8                |
|                          |                       |                       |
|                          |                       | 0: BF16 (default)     |
+--------------------------+-----------------------+-----------------------+
| ``NO_TORCH_COMPILE``     | 0 or 1                | 1: enable             |
|                          |                       | torch.compile         |
|                          |                       |                       |
|                          |                       | 0: disable            |
|                          |                       | torch.compile         |
|                          |                       | (default)             |
+--------------------------+-----------------------+-----------------------+
| ``SEQ_LENGTH``           |                       | Input sequence length |
+--------------------------+-----------------------+-----------------------+
| ``GEMM_TUNING``          | 0 or 1                | 1: enable GEMM tuning |
|                          |                       |                       |
|                          |                       | 0: disable GEMM       |
|                          |                       | tuning                |
+--------------------------+-----------------------+-----------------------+
| ``USE_FLASH_ATTN``       | 0 or 1                | 0: disable flash      |
|                          |                       | attention             |
|                          |                       |                       |
|                          |                       | 1: enable flash       |
|                          |                       | attention             |
+--------------------------+-----------------------+-----------------------+
| ``ENABLE_PROFILING``     | 0 or 1                | 0: disable torch      |
|                          |                       | profiling             |
|                          |                       |                       |
|                          |                       | 1: enable torch       |
|                          |                       | profiling             |
+--------------------------+-----------------------+-----------------------+
| ``MODEL_SIZE``           |                       | The size of the       |
|                          |                       | model: 7B/70B, etc.   |
+--------------------------+-----------------------+-----------------------+
| ``TOTAL_ITERS``          |                       | Total number of       |
|                          |                       | iterations            |
+--------------------------+-----------------------+-----------------------+
| ``transformer-impl``     | transformer_engine or | Transformer Engine is |
|                          | local                 | enabled by default    |
+--------------------------+-----------------------+-----------------------+

Benchmarking examples
^^^^^^^^^^^^^^^^^^^^^

.. tab-set::

   .. tab-item:: Single node training
      :sync: single

      Use this command to run training with the Llama 2 7B model on a single node. You can specify MBS, BS, FP,
      datatype, and so on.

      .. code-block:: bash

         TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
         SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh

      You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.

      See the sample output:

      .. image:: ../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
         :width: 800

   .. tab-item:: Multi-node training
      :sync: multi

      Launch the Docker container on each node.

      In this example, run training with the Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
      so on.

      On the master node:

      .. code-block:: bash

         TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
         SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh

      On the worker node:

      .. code-block:: bash

         TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
         SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh

      You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.

      Sample output for 2-node training:

      Master node:

      .. image:: ../../data/how-to/rocm-for-ai/2-node-training-master.png
         :width: 800

      Worker node:

      .. image:: ../../data/how-to/rocm-for-ai/2-node-training-worker.png
         :width: 800

Previous versions
=================

See :doc:`megatron-lm-history` to find documentation for previous releases
of the ``ROCm/megatron-lm`` Docker image.
@@ -1,535 +0,0 @@
:orphan:

.. meta::
   :description: How to train a model using Megatron-LM for ROCm.
   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch

******************************************
Training a model with Megatron-LM for ROCm
******************************************

.. caution::

   This documentation does not reflect the latest version of ROCm Megatron-LM
   training performance documentation. See :doc:`../megatron-lm` for the latest version.

The Megatron-LM framework for ROCm is a specialized fork of Megatron-LM
designed to enable efficient training of large-scale language models on AMD
GPUs. By leveraging AMD Instinct™ MI300X series accelerators, Megatron-LM delivers
enhanced scalability, performance, and resource utilization for AI workloads.
It is purpose-built to support models like Llama 2, Llama 3, Llama 3.1, and
DeepSeek, enabling developers to train next-generation AI models more
efficiently. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.

AMD provides a ready-to-use Docker image for MI300X accelerators containing
essential components, including PyTorch, ROCm libraries, and Megatron-LM
utilities. It contains the following software components to accelerate training
workloads:

+--------------------------+--------------------------------+
| Software component       | Version                        |
+==========================+================================+
| ROCm                     | 6.3.0                          |
+--------------------------+--------------------------------+
| PyTorch                  | 2.7.0a0+git637433              |
+--------------------------+--------------------------------+
| Python                   | 3.10                           |
+--------------------------+--------------------------------+
| Transformer Engine       | 1.11                           |
+--------------------------+--------------------------------+
| Flash Attention          | 3.0.0                          |
+--------------------------+--------------------------------+
| hipBLASLt                | git258a2162                    |
+--------------------------+--------------------------------+
| Triton                   | 3.1                            |
+--------------------------+--------------------------------+

Supported features and models
=============================

Megatron-LM provides the following key features to train large language models efficiently:

- Transformer Engine (TE)

- APEX

- GEMM tuning

- Torch.compile

- 3D parallelism: TP + SP + CP

- Distributed optimizer

- Flash Attention (FA) 3

- Fused kernels

- Pre-training

.. _amd-megatron-lm-model-support:

The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

* Llama 2 7B

* Llama 2 70B

* Llama 3 8B

* Llama 3 70B

* Llama 3.1 8B

* Llama 3.1 70B

* DeepSeek-V2-Lite

.. note::

   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).

System validation
=================

If you have already validated your system settings, skip this step. Otherwise,
complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
to set up your system before starting training.

Disable NUMA auto-balancing
---------------------------

Generally, application performance can benefit from disabling NUMA auto-balancing. However,
it might be detrimental to performance with certain types of workloads.

Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
Memory Access) settings. An output of ``0`` indicates this setting is disabled. If there is no output or
the output is ``1``, run the following command to disable NUMA auto-balancing.

.. code-block:: shell

   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'

See :ref:`mi300x-disable-numa` for more information.

.. _mi300x-amd-megatron-lm-training:

Environment setup
=================

The pre-built ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
training benchmarks, and achieve superior performance for models like Llama 3.1, Llama 2, and DeepSeek V2.

Use the following instructions to set up the environment, configure the script to train models, and
reproduce the benchmark results on the MI300X accelerators with the AMD Megatron-LM Docker
image.

.. _amd-megatron-lm-requirements:

Download the Docker image
-------------------------

1. Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

      docker pull rocm/megatron-lm:v25.3

2. Launch the Docker container.

   .. code-block:: shell

      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.3

3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.

   .. code-block:: shell

      docker start megatron_training_env
      docker exec -it megatron_training_env bash

The Docker container includes a pre-installed, verified version of Megatron-LM from the `release branch <https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3>`_.

.. _amd-megatron-lm-environment-setup:

Configuration scripts
---------------------

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      If you're working with Llama 2 7B or Llama 2 70B, use the ``train_llama2.sh`` configuration
      script in the ``examples/llama`` directory of
      `<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/llama>`__.
      Likewise, if you're working with Llama 3 or Llama 3.1, then use ``train_llama3.sh`` and update
      the configuration script accordingly.

   .. tab-item:: DeepSeek V2
      :sync: deepseek

      Use the ``train_deepseek_v2.sh`` configuration script in the ``examples/deepseek_v2``
      directory of
      `<https://github.com/ROCm/Megatron-LM/tree/megatron_release_v25.3/examples/deepseek_v2>`__
      and update the configuration script accordingly.

Network interface
^^^^^^^^^^^^^^^^^

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      To avoid connectivity issues in multi-node deployments, ensure the correct network interface
      is set in your training scripts.

      1. Run the following command (outside the container) to find the active network interface on your system.

         .. code-block:: shell

            ip a

      2. Update the ``NCCL_SOCKET_IFNAME`` and ``GLOO_SOCKET_IFNAME`` variables with your system’s network interface. For
         example:

         .. code-block:: shell

            export NCCL_SOCKET_IFNAME=ens50f0np0

            export GLOO_SOCKET_IFNAME=ens50f0np0

Dataset options
^^^^^^^^^^^^^^^

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      You can use either mock data or real data for training.

      * Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
        value is ``1`` for enabled.

        .. code-block:: bash

           MOCK_DATA=1

      * If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.

        .. code-block:: bash

           MOCK_DATA=0

           DATA_PATH=${DATA_PATH:-"/data/bookcorpus_text_sentence"} # Change to where your dataset is stored

        Ensure that the files are accessible inside the Docker container.

   .. tab-item:: DeepSeek V2
      :sync: deepseek

      If you don't already have the dataset, download the DeepSeek dataset using the following
      commands:

      .. code-block:: shell

         mkdir deepseek-datasets
         cd deepseek-datasets
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx

      You can use either mock data or real data for training.

      * Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
        value is ``1`` for enabled.

        .. code-block:: bash

           MOCK_DATA=1

      * If you're using a real dataset, update the ``DATA_DIR`` variable to point to the location of your dataset.

        .. code-block:: bash

           MOCK_DATA=0

           DATA_DIR="/root/data/deepseek-datasets" # Change to where your dataset is stored

        Ensure that the files are accessible inside the Docker container.

Tokenizer
^^^^^^^^^

Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
handle a variety of input sequences, including unseen words or domain-specific terms.

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.

      To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
      Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.

      For example, if you're using the Llama 3.1 8B model:

      .. code-block:: shell

         TOKENIZER_MODEL=meta-llama/Llama-3.1-8B

   .. tab-item:: DeepSeek V2
      :sync: deepseek

      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.

Multi-node training
^^^^^^^^^^^^^^^^^^^

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      If you're running multi-node training, update the following environment variables. They can
      also be passed as command line arguments.

      * Change ``localhost`` to the master node's hostname:

        .. code-block:: shell

           MASTER_ADDR="${MASTER_ADDR:-localhost}"

      * Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):

        .. code-block:: shell

           NNODES="${NNODES:-1}"

      * Set the rank of each node (0 for master, 1 for the first worker node, and so on):

        .. code-block:: shell

           NODE_RANK="${NODE_RANK:-0}"

      * Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
        NFS directory) for multi-node runs:

        .. code-block:: shell

           DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs

      * For multi-node runs, make sure the correct network drivers are installed on the nodes. If
        running inside Docker, either install the drivers inside the container or pass them
        through from the host when creating the container.

Start training on AMD Instinct accelerators
===========================================

The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
system performance, conduct training benchmarks, and achieve superior
performance for models like Llama 3.1 and Llama 2. This container should not be
expected to provide generalized performance across all training workloads. You
can expect the container to perform in the model configurations described in
the following section, but other configurations are not validated by AMD.

Use the following instructions to set up the environment, configure the script
to train models, and reproduce the benchmark results on MI300X series
accelerators with the AMD Megatron-LM Docker image.

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      .. tab-set::

         .. tab-item:: Single node training
            :sync: single-node

            To run training on a single node, navigate to the Megatron-LM folder and use the
            following command:

            .. code-block:: shell

               TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 bash examples/llama/train_llama3.sh

         .. tab-item:: Multi-node training
            :sync: multi-node

            To run training on multiple nodes, launch the Docker container on each node. For example, for a two-node setup (``NODE0`` as the master node), use these commands.

            * On the master node ``NODE0``:

              .. code-block:: shell

                 TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh

            * On the worker node ``NODE1``:

              .. code-block:: shell

                 TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh

   .. tab-item:: DeepSeek V2
      :sync: deepseek

      To run the training on a single node, go to the ``/workspace/Megatron-LM`` folder and use the following command:

      .. code-block:: shell

         cd /workspace/Megatron-LM
         GEMM_TUNING=1 PR=bf16 MBS=4 AC=none bash examples/deepseek_v2/train_deepseekv2.sh

Key options
-----------

.. _amd-megatron-lm-benchmark-test-vars:

The benchmark tests support the following sets of variables:

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      ``TEE_OUTPUT``
         ``1`` to enable training logs or ``0`` to disable.

      ``TE_FP8``
         ``0`` for BF16 (default) or ``1`` for FP8 GEMMs.

      ``GEMM_TUNING``
         ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.

      ``USE_FLASH_ATTN``
         ``1`` to enable Flash Attention.

      ``ENABLE_PROFILING``
         ``1`` to enable PyTorch profiling for performance analysis.

      ``transformer-impl``
         ``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.

      ``MODEL_SIZE``
         ``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2.

      ``TOTAL_ITERS``
         The total number of iterations -- ``10`` by default.

      ``MOCK_DATA``
         ``1`` to use mock data or ``0`` to use real data provided by you.

      ``MBS``
         Micro batch size.

      ``BS``
         Global batch size.

      ``TP``
         Tensor parallel (``1``, ``2``, ``4``, ``8``).

      ``SEQ_LENGTH``
         Input sequence length.

   .. tab-item:: DeepSeek V2
      :sync: deepseek

      ``PR``
         Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.

      ``GEMM_TUNING``
         ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.

      ``TOTAL_ITERS``
         The total number of iterations -- ``10`` by default.

      ``MOCK_DATA``
         ``1`` to use mock data or ``0`` to use real data provided by you.

      ``MBS``
         Micro batch size.

      ``GBS``
         Global batch size.

Benchmarking examples
---------------------

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      .. tab-set::

         .. tab-item:: Single node training
            :sync: single-node

            Use this command to run training with the Llama 2 7B model on a single node. You can specify MBS, BS, FP,
            datatype, and so on.

            .. code-block:: bash

               TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh

            You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.

            See the sample output:

            .. image:: ../../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
               :width: 800

         .. tab-item:: Multi-node training
            :sync: multi-node

            Launch the Docker container on each node.

            In this example, run training with the Llama 2 7B model on 2 nodes with specific MBS, BS, FP, datatype, and
            so on.

            On the master node:

            .. code-block:: bash

               TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh

            On the worker node:

            .. code-block:: bash

               TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh

            You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.

            Sample output for 2-node training:

            Master node:

            .. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-master.png
               :width: 800

            Worker node:

            .. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-worker.png
               :width: 800

Previous versions
=================

See :doc:`megatron-lm-history` to find documentation for previous releases
of the ``ROCm/megatron-lm`` Docker image.
@@ -1,618 +0,0 @@
:orphan:

.. meta::
   :description: How to train a model using Megatron-LM for ROCm.
   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch

******************************************
Training a model with Megatron-LM for ROCm
******************************************

.. caution::

   This documentation does not reflect the latest version of the ROCm Megatron-LM
   training performance documentation. See :doc:`../megatron-lm` for the latest version.

The Megatron-LM framework for ROCm is a specialized fork of the robust Megatron-LM,
designed to enable efficient training of large-scale language models on AMD
GPUs. By leveraging AMD Instinct™ MI300X series accelerators, Megatron-LM delivers
enhanced scalability, performance, and resource utilization for AI workloads.
It is purpose-built to support models like Llama 2, Llama 3, Llama 3.1, and
DeepSeek, enabling developers to train next-generation AI models more
efficiently. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.

AMD provides a ready-to-use Docker image for MI300X series accelerators containing
essential components, including PyTorch, ROCm libraries, and Megatron-LM
utilities. It contains the following software components to accelerate training
workloads:

+--------------------------+--------------------------------+
| Software component       | Version                        |
+==========================+================================+
| ROCm                     | 6.3.0                          |
+--------------------------+--------------------------------+
| PyTorch                  | 2.7.0a0+git637433              |
+--------------------------+--------------------------------+
| Python                   | 3.10                           |
+--------------------------+--------------------------------+
| Transformer Engine       | 1.11                           |
+--------------------------+--------------------------------+
| Flash Attention          | 3.0.0                          |
+--------------------------+--------------------------------+
| hipBLASLt                | git258a2162                    |
+--------------------------+--------------------------------+
| Triton                   | 3.1                            |
+--------------------------+--------------------------------+

Supported features and models
=============================

Megatron-LM provides the following key features to train large language models efficiently:

- Transformer Engine (TE)

- APEX

- GEMM tuning

- Torch.compile

- 3D parallelism: TP + SP + CP

- Distributed optimizer

- Flash Attention (FA) 3

- Fused kernels

- Pre-training

.. _amd-megatron-lm-model-support:

The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

* Llama 3.1 8B

* Llama 3.1 70B

* Llama 3 8B

* Llama 3 70B

* Llama 2 7B

* Llama 2 70B

* DeepSeek-V2-Lite

.. note::

   Some models, such as Llama, require an external license agreement through
   a third party (for example, Meta).

.. _amd-megatron-lm-performance-measurements:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.

.. important::

   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
   only reflects the :doc:`latest version of this training benchmarking environment <../megatron-lm>`.
   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.

System validation
=================

If you have already validated your system settings, including NUMA
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
and optimization steps <train-a-model-system-validation>` to set up your system
before starting training.

.. _mi300x-amd-megatron-lm-training:

Environment setup
=================

The prebuilt ROCm Megatron-LM environment allows users to quickly validate system performance, conduct
training benchmarks, and achieve superior performance for models like Llama 3.1, Llama 2, and DeepSeek V2.

Use the following instructions to set up the environment, configure the script to train models, and
reproduce the benchmark results on MI300X series accelerators with the AMD Megatron-LM Docker
image.

.. _amd-megatron-lm-requirements:

Download the Docker image
-------------------------

1. Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

      docker pull rocm/megatron-lm:v25.4

2. Launch the Docker container.

   .. code-block:: shell

      docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.4

3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.

   .. code-block:: shell

      docker start megatron_training_env
      docker exec -it megatron_training_env bash

The Docker container includes a pre-installed, verified version of the ROCm Megatron-LM development branch `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__
(commit `fd6f0d <https://github.com/ROCm/Megatron-LM/tree/fd6f0d11d7f9480ace32f22eb7e4dab5314fa350>`_).

.. _amd-megatron-lm-environment-setup:

Configuration scripts
---------------------

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      If you're working with Llama 2 7B or Llama 2 70B, use the ``train_llama2.sh`` configuration
      script in the ``examples/llama`` directory of
      `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/llama>`__.
      Likewise, if you're working with Llama 3 or Llama 3.1, use ``train_llama3.sh`` and update
      the configuration script accordingly.

   .. tab-item:: DeepSeek V2
      :sync: deepseek

      Use the ``train_deepseekv2.sh`` configuration script in the ``examples/deepseek_v2``
      directory of
      `<https://github.com/ROCm/Megatron-LM/tree/rocm_dev/examples/deepseek_v2>`__
      and update the configuration script accordingly.

Network interface
^^^^^^^^^^^^^^^^^

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      Update the network interface in the script to match your system's network interface. To
      find your network interface, run the following (outside of any Docker container):

      .. code-block:: bash

         ip a

      Look for an active interface that has an IP address in the same subnet as
      your other nodes. Then, update the following variables in the script, for
      example:

      .. code-block:: bash

         export NCCL_SOCKET_IFNAME=ens50f0np0

         export GLOO_SOCKET_IFNAME=ens50f0np0
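
      If many interfaces are listed, a brief, one-line-per-interface view can
      make the active ones easier to spot, for example:

      .. code-block:: bash

         ip -br addr show up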

Dataset options
^^^^^^^^^^^^^^^

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      You can use either mock data or real data for training.

      * Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
        value is ``1`` for enabled.

        .. code-block:: bash

           MOCK_DATA=1

      * If you're using a real dataset, update the ``DATA_PATH`` variable to point to the location of your dataset.

        .. code-block:: bash

           MOCK_DATA=0

           DATA_PATH="/data/bookcorpus_text_sentence" # Change to where your dataset is stored

        Ensure that the files are accessible inside the Docker container.

      To download the dataset, set the ``DATASET`` variable to the dataset you'd like to use. Two datasets are supported: ``DATASET=wiki`` and ``DATASET=bookcorpus``.
      Use the following command to download the dataset.

      .. code-block:: shell

         DATASET=wiki bash examples/llama/prepare_dataset.sh # For wiki-en dataset
         DATASET=bookcorpus bash examples/llama/prepare_dataset.sh # For bookcorpus dataset

   .. tab-item:: DeepSeek V2
      :sync: deepseek

      If you don't already have the dataset, download the DeepSeek dataset using the following
      commands:

      .. code-block:: shell

         mkdir deepseek-datasets
         cd deepseek-datasets
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/SlimPajama.json
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-train.json
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/alpaca_zh-valid.json
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.bin
         wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/deepseek-datasets/mmap_deepseekv2_datasets_text_document.idx

      You can use either mock data or real data for training.

      * Mock data can be useful for testing and validation. Use the ``MOCK_DATA`` variable to toggle between mock and real data. The default
        value is ``1`` for enabled.

        .. code-block:: bash

           MOCK_DATA=1

      * If you're using a real dataset, update the ``DATA_DIR`` variable to point to the location of your dataset.

        .. code-block:: bash

           MOCK_DATA=0

           DATA_DIR="/root/data/deepseek-datasets" # Change to where your dataset is stored

        Ensure that the files are accessible inside the Docker container.

Tokenizer
^^^^^^^^^

Tokenization is the process of converting raw text into tokens that can be processed by the model. For Llama
models, this typically involves sub-word tokenization, where words are broken down into smaller units based on
a fixed vocabulary. The tokenizer is trained along with the model on a large corpus of text, and it learns a
fixed vocabulary that can represent a wide range of text from different domains. This allows Llama models to
handle a variety of input sequences, including unseen words or domain-specific terms.

You can assign the path of an existing tokenizer to the ``TOKENIZER_MODEL`` as shown in the following examples.
If the tokenizer is not found, it'll be downloaded to the default tokenizer model path: ``${DATA_DIR}/tokenizer_llama3``
or ``${DATA_DIR}/tokenizer_llama2``.

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
      or the default ``HuggingFaceTokenizer``.

      To train any of the Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
      Set the Hugging Face model path in the ``TOKENIZER_MODEL`` variable.

      For example, if you're using the Llama 3.1 8B model:

      .. code-block:: shell

         TOKENIZER_MODEL=meta-llama/Llama-3.1-8B

      .. note::

         If you don't already have the Llama 3.1 tokenizer locally, set your
         personal Hugging Face access token ``HF_TOKEN`` to download the
         tokenizer. If you encounter the following error, set ``HF_TOKEN`` to
         your access-authorized Hugging Face token.

         .. code-block:: shell

            OSError: You are trying to access a gated repo.

            # pass your HF_TOKEN
            export HF_TOKEN=$your_personal_hf_token

   .. tab-item:: DeepSeek V2
      :sync: deepseek

      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.

Multi-node training
^^^^^^^^^^^^^^^^^^^

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      If you're running multi-node training, update the following environment variables. They can
      also be passed as command line arguments.

      * Change ``localhost`` to the master node's hostname:

        .. code-block:: shell

           MASTER_ADDR="${MASTER_ADDR:-localhost}"

      * Set the number of nodes you want to train on (for instance, ``2``, ``4``, ``8``):

        .. code-block:: shell

           NNODES="${NNODES:-1}"

      * Set the rank of each node (0 for master, 1 for the first worker node, and so on):

        .. code-block:: shell

           NODE_RANK="${NODE_RANK:-0}"

      * Set ``DATA_CACHE_PATH`` to a common directory accessible by all the nodes (for example, an
        NFS directory) for multi-node runs:

        .. code-block:: shell

           DATA_CACHE_PATH=/root/cache # Set to a common directory for multi-node runs

      * For multi-node runs, make sure the correct network drivers are installed on the nodes. If
        inside a Docker container, either install the drivers inside the Docker container or pass the network
        drivers from the host while creating the Docker container.

        .. code-block:: shell

           # Specify which RDMA interfaces to use for communication
           export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7

Start training on AMD Instinct accelerators
===========================================

The prebuilt Megatron-LM with ROCm training environment allows users to quickly validate
system performance, conduct training benchmarks, and achieve superior
performance for models like Llama 3.1 and Llama 2. This container should not be
expected to provide generalized performance across all training workloads. You
can expect the container to perform in the model configurations described in
the following section, but other configurations are not validated by AMD.

Use the following instructions to set up the environment, configure the script
to train models, and reproduce the benchmark results on MI300X series
accelerators with the AMD Megatron-LM Docker image.

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      .. tab-set::

         .. tab-item:: Single node training
            :sync: single-node

            To run training on a single node, navigate to the Megatron-LM folder and use one of the
            following commands.

            - For Llama 3.1 8B FP8:

              .. code-block:: shell

                 TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh

            - For Llama 3.1 8B BF16:

              .. code-block:: shell

                 TEE_OUTPUT=1 MBS=2 BS=128 TP=1 TE_FP8=0 SEQ_LENGTH=8192 MODEL_SIZE=8 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh

            - For Llama 2 7B FP8:

              .. code-block:: shell

                 TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh

            - For Llama 2 7B BF16:

              .. code-block:: shell

                 TEE_OUTPUT=1 MBS=4 BS=256 TP=1 TE_FP8=0 SEQ_LENGTH=4096 MODEL_SIZE=7 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh

            To run training with FSDP2 enabled, add the ``FSDP=1`` argument. For example:

            - For Llama 3 70B BF16:

              .. code-block:: shell

                 TEE_OUTPUT=1 MBS=3 BS=24 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=8192 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama3.sh

            - For Llama 2 70B BF16:

              .. code-block:: shell

                 TEE_OUTPUT=1 MBS=3 BS=56 TP=1 TE_FP8=0 FSDP=1 RECOMPUTE=1 SEQ_LENGTH=4096 MODEL_SIZE=70 TOTAL_ITERS=50 bash examples/llama/train_llama2.sh

            .. note::

               It's suggested to use ``TP=1`` when FSDP is enabled for higher throughput. FSDP2 is not supported with pipeline parallelism,
               expert parallelism, MCore's distributed optimizer, gradient accumulation fusion, and ``FP16`` precision.

         .. tab-item:: Multi-node training
            :sync: multi-node

            To run training on multiple nodes, launch the Docker container on each node. For example, for a two-node setup (``NODE0`` as the master node), use these commands.

            * On the master node ``NODE0``:

              .. code-block:: shell

                 TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=0 bash examples/llama/train_llama3.sh

            * On the worker node ``NODE1``:

              .. code-block:: shell

                 TEE_OUTPUT=1 MBS=2 BS=256 TP=1 TE_FP8=1 SEQ_LENGTH=8192 MODEL_SIZE=8 MASTER_ADDR=IP_NODE0 NNODES=2 NODE_RANK=1 bash examples/llama/train_llama3.sh

   .. tab-item:: DeepSeek V2
      :sync: deepseek

      To run the training on a single node, go to the ``/workspace/Megatron-LM`` folder and use the following command:

      .. code-block:: shell

         cd /workspace/Megatron-LM
         GEMM_TUNING=1 PR=bf16 MBS=4 AC=none SEQ_LEN=4096 PAD_LEN=4096 TRAIN_ITERS=50 bash examples/deepseek_v2/train_deepseekv2.sh

Key options
-----------

.. _amd-megatron-lm-benchmark-test-vars:

The benchmark tests support the following sets of variables:

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      ``TEE_OUTPUT``
         ``1`` to enable training logs or ``0`` to disable.

      ``TE_FP8``
         ``0`` for BF16 or ``1`` for FP8 -- ``0`` by default.

      ``GEMM_TUNING``
         ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.

      ``USE_FLASH_ATTN``
         ``1`` to enable Flash Attention.

      ``FSDP``
         ``1`` to enable PyTorch FSDP2. If FSDP is enabled, ``--use-distributed-optimizer``,
         ``--overlap-param-gather``, and ``--sequence-parallel`` are automatically disabled.

      ``ENABLE_PROFILING``
         ``1`` to enable PyTorch profiling for performance analysis.

      ``transformer-impl``
         ``transformer_engine`` to use the Transformer Engine (TE) or ``local`` to disable TE.

      ``MODEL_SIZE``
         ``8B`` or ``70B`` for Llama 3 and 3.1. ``7B`` or ``70B`` for Llama 2.

      ``TOTAL_ITERS``
         The total number of iterations -- ``10`` by default.

      ``MOCK_DATA``
         ``1`` to use mock data or ``0`` to use real data you provide.

      ``MBS``
         Micro batch size.

      ``BS``
         Global batch size.

      ``TP``
         Tensor parallel (``1``, ``2``, ``4``, ``8``). ``TP`` is disabled when ``FSDP`` is turned on.

      ``SEQ_LENGTH``
         Input sequence length.

   .. tab-item:: DeepSeek V2
      :sync: deepseek

      ``PR``
         Precision for training. ``bf16`` for BF16 (default) or ``fp8`` for FP8 GEMMs.

      ``GEMM_TUNING``
         ``1`` to enable GEMM tuning, which boosts performance by using the best GEMM kernels.

      ``TRAIN_ITERS``
         The total number of iterations.

      ``MOCK_DATA``
         ``1`` to use mock data or ``0`` to use real data you provide.

      ``MBS``
         Micro batch size.

      ``GBS``
         Global batch size.

      ``SEQ_LEN``
         Input sequence length.

      ``AC``
         Activation checkpointing (``none``, ``sel``, or ``full``) -- ``sel`` by default.
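
      For example, a hypothetical FP8 run that exercises these variables (the
      values are illustrative; adjust them to your workload):

      .. code-block:: shell

         PR=fp8 GEMM_TUNING=1 MBS=4 GBS=128 SEQ_LEN=4096 AC=full TRAIN_ITERS=20 bash examples/deepseek_v2/train_deepseekv2.sh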

Benchmarking examples
---------------------

.. tab-set::

   .. tab-item:: Llama
      :sync: llama

      .. tab-set::

         .. tab-item:: Single node training
            :sync: single-node

            Use this command to run training with the Llama 2 7B model on a single node. You can specify
            variables such as the MBS, BS, precision, and data type.

            .. code-block:: bash

               TEE_OUTPUT=1 MBS=5 BS=120 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh

            You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.

            See the sample output:

            .. image:: ../../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
               :width: 800

         .. tab-item:: Multi-node training
            :sync: multi-node

            Launch the Docker container on each node.

            In this example, run training with the Llama 2 7B model on two nodes, specifying the
            MBS, BS, precision, and data type.

            On the master node:

            .. code-block:: bash

               TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh

            On the worker node:

            .. code-block:: bash

               TEE_OUTPUT=1 MBS=4 BS=64 TP=8 TE_FP8=0 NO_TORCH_COMPILE=1 \
               SEQ_LENGTH=4096 bash examples/llama/train_llama2.sh

            You can find the training logs at the location defined in ``$TRAIN_LOG`` in the :ref:`configuration script <amd-megatron-lm-environment-setup>`.

            Sample output for 2-node training:

            Master node:

            .. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-master.png
               :width: 800

            Worker node:

            .. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-worker.png
               :width: 800

Previous versions
=================

See :doc:`megatron-lm-history` to find documentation for previous releases
of the ``ROCm/megatron-lm`` Docker image.
@@ -1,47 +0,0 @@
:orphan:

****************************************************
PyTorch training performance testing version history
****************************************************

This table lists previous versions of the ROCm PyTorch training Docker image for
training performance testing. For detailed information about available models
for benchmarking, see the version-specific documentation. You can find tagged
previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/pytorch-training/tags>`_.

.. list-table::
   :header-rows: 1
   :stub-columns: 1

   * - Image version
     - ROCm version
     - PyTorch version
     - Resources

   * - v25.6
     - 6.3.4
     - 2.8.0a0+git7d205b2
     -
       * :doc:`Documentation <../pytorch-training>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_

   * - v25.5
     - 6.3.4
     - 2.7.0a0+git637433
     -
       * :doc:`Documentation <pytorch-training-v25.5>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_

   * - v25.4
     - 6.3.0
     - 2.7.0a0+git637433
     -
       * :doc:`Documentation <pytorch-training-v25.4>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.4/images/sha256-fa98a9aa69968e654466c06f05aaa12730db79b48b113c1ab4f7a5fe6920a20b>`_

   * - v25.3
     - 6.3.0
     - 2.7.0a0+git637433
     -
       * :doc:`Documentation <pytorch-training-v25.3>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.3/images/sha256-0ffdde1b590fd2787b1c7adf5686875b100980b0f314090901387c44253e709b>`_
@@ -1,352 +0,0 @@
:orphan:

.. meta::
   :description: How to train a model using PyTorch for ROCm.
   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker

**************************************
Training a model with PyTorch for ROCm
**************************************

.. caution::

   This documentation does not reflect the latest version of the ROCm PyTorch
   training performance documentation. See :doc:`../pytorch-training` for the latest version.

PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.

The PyTorch for ROCm training Docker (``rocm/pytorch-training:v25.3``) image
provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
software components to accelerate training workloads:

+--------------------------+--------------------------------+
| Software component       | Version                        |
+==========================+================================+
| ROCm                     | 6.3.0                          |
+--------------------------+--------------------------------+
| PyTorch                  | 2.7.0a0+git637433              |
+--------------------------+--------------------------------+
| Python                   | 3.10                           |
+--------------------------+--------------------------------+
| Transformer Engine       | 1.11                           |
+--------------------------+--------------------------------+
| Flash Attention          | 3.0.0                          |
+--------------------------+--------------------------------+
| hipBLASLt                | git258a2162                    |
+--------------------------+--------------------------------+
| Triton                   | 3.1                            |
+--------------------------+--------------------------------+

.. _amd-pytorch-training-model-support:

Supported models
================

The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

* Llama 3.1 8B

* Llama 3.1 70B

* FLUX.1-dev

.. note::

   Only these models are supported in the following steps.

   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).

System validation
=================

If you have already validated your system settings, skip this step. Otherwise,
complete the :ref:`system validation and optimization steps <train-a-model-system-validation>`
to set up your system before starting training.

Disable NUMA auto-balancing
---------------------------

Generally, application performance can benefit from disabling NUMA auto-balancing. However,
it might be detrimental to performance with certain types of workloads.

Run the command ``cat /proc/sys/kernel/numa_balancing`` to check your current NUMA (Non-Uniform
Memory Access) settings. Output ``0`` indicates this setting is disabled. If there is no output or
the output is ``1``, run the following command to disable NUMA auto-balancing.

.. code-block:: shell

   sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
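
A small sketch that combines the check and the change, so the setting is only
written when it isn't already ``0`` (same commands as above, combined):

.. code-block:: shell

   if [ "$(cat /proc/sys/kernel/numa_balancing)" != "0" ]; then
       sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
   fi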

See :ref:`mi300x-disable-numa` for more information.

Environment setup
=================

This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn’t validate configurations and run conditions outside those described.

Download the Docker image
-------------------------

1. Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

      docker pull rocm/pytorch-training:v25.3

2. Run the Docker container.

   .. code-block:: shell

      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.3

3. Use these commands if you exit the ``training_env`` container and need to return to it.

   .. code-block:: shell

      docker start training_env
      docker exec -it training_env bash

4. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__ repository and navigate to the benchmark scripts directory.

   .. code-block:: shell

      git clone https://github.com/ROCm/MAD
      cd MAD/scripts/pytorch-train

Prepare training datasets and dependencies
------------------------------------------

The following benchmarking examples may require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
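
For example (the token value is a placeholder for your own Hugging Face access
token):

.. code-block:: shell

   export HF_TOKEN=$your_personal_hugging_face_access_token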

Run the setup script to install libraries and datasets needed for benchmarking.

.. code-block:: shell

   ./pytorch_benchmark_setup.sh

``pytorch_benchmark_setup.sh`` installs the following libraries:

.. list-table::
   :header-rows: 1

   * - Library
     - Benchmark model
     - Reference

   * - ``accelerate``
     - Llama 3.1 8B, FLUX
     - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

   * - ``datasets``
     - Llama 3.1 8B, 70B, FLUX
     - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

   * - ``torchdata``
     - Llama 3.1 70B
     - `TorchData <https://pytorch.org/data/beta/index.html>`_

   * - ``tomli``
     - Llama 3.1 70B
     - `Tomli <https://pypi.org/project/tomli/>`_

   * - ``tiktoken``
     - Llama 3.1 70B
     - `tiktoken <https://github.com/openai/tiktoken>`_

   * - ``blobfile``
     - Llama 3.1 70B
     - `blobfile <https://pypi.org/project/blobfile/>`_

   * - ``tabulate``
     - Llama 3.1 70B
     - `tabulate <https://pypi.org/project/tabulate/>`_

   * - ``wandb``
     - Llama 3.1 70B
     - `Weights & Biases <https://github.com/wandb/wandb>`_

   * - ``sentencepiece``
     - Llama 3.1 70B, FLUX
     - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

   * - ``tensorboard``
     - Llama 3.1 70B, FLUX
     - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

   * - ``csvkit``
     - FLUX
     - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1

   * - ``deepspeed``
     - FLUX
     - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2

   * - ``diffusers``
     - FLUX
     - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0

   * - ``GitPython``
     - FLUX
     - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44

   * - ``opencv-python-headless``
     - FLUX
     - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84

   * - ``peft``
     - FLUX
     - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0

   * - ``protobuf``
     - FLUX
     - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2

   * - ``pytest``
     - FLUX
     - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4

   * - ``python-dotenv``
     - FLUX
     - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1

   * - ``seaborn``
     - FLUX
     - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2

   * - ``transformers``
     - FLUX
     - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0

``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:

* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_

* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_

Along with the following datasets:

* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_

* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_

Start training on AMD Instinct accelerators
===========================================

The prebuilt PyTorch with ROCm training environment allows users to quickly validate
system performance, conduct training benchmarks, and achieve superior
performance for models like Llama 3.1 and Llama 2. This container should not be
expected to provide generalized performance across all training workloads. You
can expect the container to perform in the model configurations described in
the following section, but other configurations are not validated by AMD.

Use the following instructions to set up the environment, configure the script
to train models, and reproduce the benchmark results on MI300X series
accelerators with the AMD PyTorch training Docker image.

Once your environment is set up, use the following commands and examples to start benchmarking.

Pretraining
-----------

To start the pretraining benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.

.. code-block:: shell

   ./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length

Options and available models
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. list-table::
   :header-rows: 1

   * - Name
     - Options
     - Description

   * - ``$training_mode``
     - ``pretrain``
     - Benchmark pretraining

   * -
     - ``finetune_fw``
     - Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)

   * -
     - ``finetune_lora``
     - Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)

   * - ``$datatype``
     - FP8 or BF16
     - Only Llama 3.1 8B supports FP8 precision.

   * - ``$model_repo``
     - Llama-3.1-8B
     - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_

   * -
     - Llama-3.1-70B
     - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_

   * -
     - Flux
     - `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_

Fine-tuning
-----------

To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.

.. code-block:: shell

   ./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B

Benchmarking examples
---------------------

Here are some examples of how to use the command.

* Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.

  .. code-block:: shell

     ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192

* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerator.

  .. code-block:: shell

     ./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192

* Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.

  .. code-block:: shell

     ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux

* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B

  .. code-block:: shell

     ./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B

* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B

  .. code-block:: shell

     ./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B

Previous versions
=================

See :doc:`pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.
@@ -1,397 +0,0 @@
:orphan:

.. meta::
   :description: How to train a model using PyTorch for ROCm.
   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker

**************************************
Training a model with PyTorch for ROCm
**************************************

.. caution::

   This documentation does not reflect the latest version of the ROCm PyTorch
   training performance documentation. See :doc:`../pytorch-training` for the latest version.

PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.

The PyTorch for ROCm training Docker (``rocm/pytorch-training:v25.4``) image
provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
software components to accelerate training workloads:

+--------------------------+--------------------------------+
| Software component       | Version                        |
+==========================+================================+
| ROCm                     | 6.3.0                          |
+--------------------------+--------------------------------+
| PyTorch                  | 2.7.0a0+git637433              |
+--------------------------+--------------------------------+
| Python                   | 3.10                           |
+--------------------------+--------------------------------+
| Transformer Engine       | 1.11                           |
+--------------------------+--------------------------------+
| Flash Attention          | 3.0.0                          |
+--------------------------+--------------------------------+
| hipBLASLt                | git258a2162                    |
+--------------------------+--------------------------------+
| Triton                   | 3.1                            |
+--------------------------+--------------------------------+

.. _amd-pytorch-training-model-support:

Supported models
================

The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.

* Llama 3.1 8B

* Llama 3.1 70B

* Llama 2 70B

* FLUX.1-dev

.. note::

   Only these models are supported in the following steps.

   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).

.. _amd-pytorch-training-performance-measurements:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.

.. note::

   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
   should not be interpreted as the peak performance achievable by AMD
   Instinct MI325X and MI300X accelerators or ROCm software.

System validation
=================

If you have already validated your system settings, including NUMA
auto-balancing, skip this step. Otherwise, complete the :ref:`system validation
and optimization steps <train-a-model-system-validation>` to set up your system
before starting training.

Environment setup
=================

This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn’t validate configurations and run conditions outside those described.

Download the Docker image
-------------------------

1. Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

      docker pull rocm/pytorch-training:v25.4

2. Run the Docker container.

   .. code-block:: shell

      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.4

3. Use these commands if you exit the ``training_env`` container and need to return to it.

   .. code-block:: shell

      docker start training_env
      docker exec -it training_env bash

4. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
   repository and navigate to the benchmark scripts directory
   ``/workspace/MAD/scripts/pytorch_train``.

   .. code-block:: shell

      git clone https://github.com/ROCm/MAD
      cd MAD/scripts/pytorch_train

Prepare training datasets and dependencies
------------------------------------------

The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.

.. code-block:: shell

   export HF_TOKEN=$your_personal_hugging_face_access_token

Run the setup script to install libraries and datasets needed for benchmarking.

.. code-block:: shell

   ./pytorch_benchmark_setup.sh

``pytorch_benchmark_setup.sh`` installs the following libraries:

.. list-table::
   :header-rows: 1

   * - Library
     - Benchmark model
     - Reference

   * - ``accelerate``
     - Llama 3.1 8B, FLUX
     - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

   * - ``datasets``
     - Llama 3.1 8B, 70B, FLUX
     - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

   * - ``torchdata``
     - Llama 3.1 70B
     - `TorchData <https://pytorch.org/data/beta/index.html>`_

   * - ``tomli``
     - Llama 3.1 70B
     - `Tomli <https://pypi.org/project/tomli/>`_

   * - ``tiktoken``
     - Llama 3.1 70B
     - `tiktoken <https://github.com/openai/tiktoken>`_

   * - ``blobfile``
     - Llama 3.1 70B
     - `blobfile <https://pypi.org/project/blobfile/>`_

   * - ``tabulate``
     - Llama 3.1 70B
     - `tabulate <https://pypi.org/project/tabulate/>`_

   * - ``wandb``
     - Llama 3.1 70B
     - `Weights & Biases <https://github.com/wandb/wandb>`_

   * - ``sentencepiece``
     - Llama 3.1 70B, FLUX
     - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

   * - ``tensorboard``
     - Llama 3.1 70B, FLUX
     - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

   * - ``csvkit``
     - FLUX
     - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1

   * - ``deepspeed``
     - FLUX
     - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2

   * - ``diffusers``
     - FLUX
     - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0

   * - ``GitPython``
     - FLUX
     - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44

   * - ``opencv-python-headless``
     - FLUX
     - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84

   * - ``peft``
     - FLUX
     - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0

   * - ``protobuf``
     - FLUX
     - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2

   * - ``pytest``
     - FLUX
     - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4

   * - ``python-dotenv``
     - FLUX
     - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1

   * - ``seaborn``
     - FLUX
     - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2

   * - ``transformers``
     - FLUX
     - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0

``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:

* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_

* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_

Along with the following datasets:

* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_

* `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_

* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_

Getting started
===============

The prebuilt PyTorch with ROCm training environment allows users to quickly validate
system performance, conduct training benchmarks, and achieve superior
performance for models like Llama 3.1 and Llama 2. This container should not be
expected to provide generalized performance across all training workloads. You
can expect the container to perform in the model configurations described in
the following section, but other configurations are not validated by AMD.

Use the following instructions to set up the environment, configure the script
to train models, and reproduce the benchmark results on MI325X and MI300X
accelerators with the AMD PyTorch training Docker image.

Once your environment is set up, use the following commands and examples to start benchmarking.

Pretraining
-----------

To start the pretraining benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.

.. code-block:: shell

   ./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length

Options and available models
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. list-table::
   :header-rows: 1

   * - Name
     - Options
     - Description

   * - ``$training_mode``
     - ``pretrain``
     - Benchmark pretraining

   * -
     - ``finetune_fw``
     - Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)

   * -
     - ``finetune_lora``
     - Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)

   * -
     - ``HF_finetune_lora``
     - Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)

   * - ``$datatype``
     - ``FP8`` or ``BF16``
     - Only Llama 3.1 8B supports FP8 precision.

   * - ``$model_repo``
     - ``Llama-3.1-8B``
     - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_

   * -
     - ``Llama-3.1-70B``
     - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_

   * -
     - ``Llama-2-70B``
     - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_

   * -
     - ``Flux``
     - `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_

   * - ``$sequence_length``
     - Between 2048 and 8192. ``8192`` by default.
     - Sequence length for the language model.

.. note::

   Occasionally, downloading the Flux dataset might fail. In the event of this
   error, manually download it from Hugging Face at
   `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
   and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
   the required dataset.

Fine-tuning
-----------

To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.

.. code-block:: shell

   ./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B

Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
`Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.

.. code-block:: shell

   ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

Benchmarking examples
---------------------

Here are some examples of how to use the command.

* Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.

  .. code-block:: shell

     ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192

* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerator.

  .. code-block:: shell

     ./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192

* Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.

  .. code-block:: shell

     ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux

* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B

  .. code-block:: shell

     ./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B

* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B

  .. code-block:: shell

     ./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B

* Example 6: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B

  .. code-block:: shell

     ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

Previous versions
=================

See :doc:`pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.
@@ -1,437 +0,0 @@
|
||||
.. meta::
|
||||
:description: How to train a model using PyTorch for ROCm.
|
||||
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
|
||||
|
||||
**************************************
|
||||
Training a model with PyTorch for ROCm
|
||||
**************************************
|
||||
|
||||
.. caution::
|
||||
|
||||
This documentation does not reflect the latest version of ROCm vLLM
|
||||
performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
|
||||
|
||||
PyTorch is an open-source machine learning framework that is widely used for
|
||||
model training with GPU-optimized components for transformer-based models.
|
||||
|
||||
The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
|
||||
(``rocm/pytorch-training:v25.5``) image
|
||||
provides a prebuilt optimized environment for fine-tuning and pretraining a
|
||||
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
|
||||
software components to accelerate training workloads:
|
||||
|
||||
+--------------------------+--------------------------------+
|
||||
| Software component | Version |
|
||||
+==========================+================================+
|
||||
| ROCm | 6.3.4 |
|
||||
+--------------------------+--------------------------------+
|
||||
| PyTorch | 2.7.0a0+git637433 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Python | 3.10 |
|
||||
+--------------------------+--------------------------------+
|
||||
| Transformer Engine | 1.12.0.dev0+25a33da |
|
||||
+--------------------------+--------------------------------+
|
||||
| Flash Attention | 3.0.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
| hipBLASLt | git53b53bf |
|
||||
+--------------------------+--------------------------------+
|
||||
| Triton | 3.2.0 |
|
||||
+--------------------------+--------------------------------+
|
||||
|
||||
.. _amd-pytorch-training-model-support:
|
||||
|
||||
Supported models
|
||||
================
|
||||
|
||||
The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
|
||||
|
||||
* Llama 3.3 70B
|
||||
|
||||
* Llama 3.1 8B
|
||||
|
||||
* Llama 3.1 70B
|
||||
|
||||
* Llama 2 70B
|
||||
|
||||
* FLUX.1-dev
|
||||
|
||||
.. note::
|
||||
|
||||
Only these models are supported in the following steps.
|
||||
|
||||
Some models, such as Llama 3, require an external license agreement through
|
||||
a third party (for example, Meta).

.. _amd-pytorch-training-performance-measurements:

Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.

.. note::

   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
   should not be interpreted as the peak performance achievable by AMD
   Instinct MI325X and MI300X accelerators or ROCm software.

System validation
=================

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn’t validate configurations and run conditions outside those described.

Benchmarking
============

Once the setup is complete, choose between two options to start benchmarking:

.. tab-set::

   .. tab-item:: MAD-integrated benchmarking

      Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
      directory and install the required packages on the host machine.

      .. code-block:: shell

         git clone https://github.com/ROCm/MAD
         cd MAD
         pip install -r requirements.txt

      For example, use this command to run the performance benchmark test on the Llama 3.1 8B model
      using one GPU with the float16 data type on the host machine.

      .. code-block:: shell

         export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
         python3 tools/run_models.py --tags pyt_train_llama-3.1-8b --keep-model-dir --live-output --timeout 28800

      The available models for MAD-integrated benchmarking are:

      * ``pyt_train_llama-3.3-70b``

      * ``pyt_train_llama-3.1-8b``

      * ``pyt_train_llama-3.1-70b``

      * ``pyt_train_flux``

      MAD launches a Docker container with the name
      ``container_ci-pyt_train_llama-3.1-8b``, for example. The latency and throughput reports of the
      model are collected in the following path: ``~/MAD/perf.csv``.
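
      The report is a plain CSV file. As a quick way to inspect it from the host
      shell (a suggestion, assuming the standard ``column`` utility is available):

      .. code-block:: shell

         # Render the comma-separated report as an aligned table
         column -s, -t ~/MAD/perf.csv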

   .. tab-item:: Standalone benchmarking

      .. rubric:: Download the Docker image and required packages

      Use the following command to pull the Docker image from Docker Hub.

      .. code-block:: shell

         docker pull rocm/pytorch-training:v25.5

      Run the Docker container.

      .. code-block:: shell

         docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.5

      Use these commands if you exit the ``training_env`` container and need to return to it.

      .. code-block:: shell

         docker start training_env
         docker exec -it training_env bash

      In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
      repository and navigate to the benchmark scripts directory
      ``/workspace/MAD/scripts/pytorch_train``.

      .. code-block:: shell

         git clone https://github.com/ROCm/MAD
         cd MAD/scripts/pytorch_train

      .. rubric:: Prepare training datasets and dependencies

      The following benchmarking examples require downloading models and datasets
      from Hugging Face. To ensure successful access to gated repos, set your
      ``HF_TOKEN``.

      .. code-block:: shell

         export HF_TOKEN=$your_personal_hugging_face_access_token
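
      Before starting long downloads, you can optionally confirm the token is valid
      (``huggingface-cli`` ships with the ``huggingface_hub`` package and reads
      ``HF_TOKEN`` from the environment; this check is a suggestion, not a required step):

      .. code-block:: shell

         # Prints the account the token resolves to, or an error if it is invalid
         huggingface-cli whoami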

      Run the setup script to install libraries and datasets needed for benchmarking.

      .. code-block:: shell

         ./pytorch_benchmark_setup.sh

      ``pytorch_benchmark_setup.sh`` installs the following libraries:

      .. list-table::
         :header-rows: 1

         * - Library
           - Benchmark model
           - Reference

         * - ``accelerate``
           - Llama 3.1 8B, FLUX
           - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

         * - ``datasets``
           - Llama 3.1 8B, 70B, FLUX
           - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

         * - ``torchdata``
           - Llama 3.1 70B
           - `TorchData <https://pytorch.org/data/beta/index.html>`_

         * - ``tomli``
           - Llama 3.1 70B
           - `Tomli <https://pypi.org/project/tomli/>`_

         * - ``tiktoken``
           - Llama 3.1 70B
           - `tiktoken <https://github.com/openai/tiktoken>`_

         * - ``blobfile``
           - Llama 3.1 70B
           - `blobfile <https://pypi.org/project/blobfile/>`_

         * - ``tabulate``
           - Llama 3.1 70B
           - `tabulate <https://pypi.org/project/tabulate/>`_

         * - ``wandb``
           - Llama 3.1 70B
           - `Weights & Biases <https://github.com/wandb/wandb>`_

         * - ``sentencepiece``
           - Llama 3.1 70B, FLUX
           - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

         * - ``tensorboard``
           - Llama 3.1 70B, FLUX
           - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

         * - ``csvkit``
           - FLUX
           - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1

         * - ``deepspeed``
           - FLUX
           - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2

         * - ``diffusers``
           - FLUX
           - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0

         * - ``GitPython``
           - FLUX
           - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44

         * - ``opencv-python-headless``
           - FLUX
           - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84

         * - ``peft``
           - FLUX
           - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0

         * - ``protobuf``
           - FLUX
           - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2

         * - ``pytest``
           - FLUX
           - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4

         * - ``python-dotenv``
           - FLUX
           - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1

         * - ``seaborn``
           - FLUX
           - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2

         * - ``transformers``
           - FLUX
           - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0

      ``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:

      * `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_

      * `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_

      Along with the following datasets:

      * `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_

      * `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_

      * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_

      .. rubric:: Pretraining

      To start the pretraining benchmark, use the following command with the
      appropriate options. See the following list of options and their descriptions.

      .. code-block:: shell

         ./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length

      .. list-table::
         :header-rows: 1

         * - Name
           - Options
           - Description

         * - ``$training_mode``
           - ``pretrain``
           - Benchmark pretraining

         * -
           - ``finetune_fw``
           - Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)

         * -
           - ``finetune_lora``
           - Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)

         * -
           - ``HF_finetune_lora``
           - Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)

         * - ``$datatype``
           - ``FP8`` or ``BF16``
           - Only Llama 3.1 8B supports FP8 precision.

         * - ``$model_repo``
           - ``Llama-3.3-70B``
           - `Llama 3.3 70B <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_

         * -
           - ``Llama-3.1-8B``
           - `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_

         * -
           - ``Llama-3.1-70B``
           - `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_

         * -
           - ``Llama-2-70B``
           - `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_

         * -
           - ``Flux``
           - `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_

         * - ``$sequence_length``
           - Between 2048 and 8192. 8192 by default.
           - Sequence length for the language model.
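
      For instance, combining these options to pretrain Llama 3.1 8B in BF16 with a
      hypothetical shorter 4096-token sequence length would look like this:

      .. code-block:: shell

         ./pytorch_benchmark_report.sh -t pretrain -m Llama-3.1-8B -p BF16 -s 4096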

      .. note::

         Occasionally, downloading the Flux dataset might fail. In the event of this
         error, manually download it from Hugging Face at
         `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
         and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
         the required dataset.
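
      One way to fetch it manually (assuming ``huggingface-cli`` is available in the
      container and your ``HF_TOKEN`` grants access to the gated repository) is:

      .. code-block:: shell

         # Download the gated FLUX.1-dev repository into the directory the script expects
         huggingface-cli download black-forest-labs/FLUX.1-dev --local-dir /workspace/FluxBenchmark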

      .. rubric:: Fine-tuning

      To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
      with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.

      .. code-block:: shell

         ./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B

      Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
      `Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.

      .. code-block:: shell

         ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

      .. rubric:: Benchmarking examples

      Here are some example commands to get started pretraining and fine-tuning with various model configurations.

      * Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192

      * Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerate.

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192

      * Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux

      * Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B.

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B

      * Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B.

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B

      * Example 6: Torchtune full weight fine-tuning with Llama 3.3 70B.

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.3-70B

      * Example 7: Torchtune LoRA fine-tuning with Llama 3.3 70B.

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.3-70B

      * Example 8: Torchtune QLoRA fine-tuning with Llama 3.3 70B.

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t finetune_qlora -p BF16 -m Llama-3.3-70B

      * Example 9: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B.

        .. code-block:: shell

           ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

@@ -9,27 +9,28 @@ Training a model with PyTorch for ROCm
PyTorch is an open-source machine learning framework that is widely used for
model training with GPU-optimized components for transformer-based models.

The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
training workloads:
The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.5/images/sha256-d47850a9b25b4a7151f796a8d24d55ea17bba545573f0d50d54d3852f96ecde5>`_
(``rocm/pytorch-training:v25.5``) image
provides a prebuilt optimized environment for fine-tuning and pretraining a
model on AMD Instinct MI325X and MI300X accelerators. It includes the following
software components to accelerate training workloads:

+--------------------------+--------------------------------+
| Software component       | Version                        |
+==========================+================================+
| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
| PyTorch                  | 2.8.0a0+git7d205b2             |
| PyTorch                  | 2.7.0a0+git637433              |
+--------------------------+--------------------------------+
| Python                   | 3.10.17                        |
| Python                   | 3.10                           |
+--------------------------+--------------------------------+
| Transformer Engine       | 1.14.0+2f85f5f2                |
| Transformer Engine       | 1.12.0.dev0+25a33da            |
+--------------------------+--------------------------------+
| Flash Attention          | 3.0.0.post1                    |
| Flash Attention          | 3.0.0                          |
+--------------------------+--------------------------------+
| hipBLASLt                | 0.15.0-8c6919d                 |
| hipBLASLt                | git53b53bf                     |
+--------------------------+--------------------------------+
| Triton                   | 3.3.0                          |
| Triton                   | 3.2.0                          |
+--------------------------+--------------------------------+

.. _amd-pytorch-training-model-support:
@@ -39,396 +40,422 @@ Supported models

The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.

.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
* Llama 3.3 70B

{% set unified_docker = data.unified_docker.latest %}
{% set model_groups = data.model_groups %}
* Llama 3.1 8B

.. raw:: html
* Llama 3.1 70B

<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Workload</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
* Llama 2 70B

<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
* FLUX.1-dev

.. note::
.. note::

Some models require an external license agreement through a third party (for example, Meta).
Only these models are supported in the following steps.

.. _amd-pytorch-training-performance-measurements:
Some models, such as Llama 3, require an external license agreement through
a third party (for example, Meta).

Performance measurements
========================
.. _amd-pytorch-training-performance-measurements:

To evaluate performance, the
Performance measurements
========================

To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.

.. note::

The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
page provides reference throughput and latency measurements for training
popular AI models.
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.

.. note::
System validation
=================

The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
should not be interpreted as the peak performance achievable by AMD
Instinct MI325X and MI300X accelerators or ROCm software.
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.

System validation
=================
If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.

Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.

If you have already validated your system settings, including aspects like NUMA auto-balancing, you
can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
before starting training.
This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn’t validate configurations and run conditions outside those described.

To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Benchmarking
============

This Docker image is optimized for specific model configurations outlined
below. Performance can vary for other training workloads, as AMD
doesn’t validate configurations and run conditions outside those described.
Once the setup is complete, choose between two options to start benchmarking:

Benchmarking
============
.. tab-set::

Once the setup is complete, choose between two options to start benchmarking:
.. tab-item:: MAD-integrated benchmarking

.. tab-set::
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.

.. tab-item:: MAD-integrated benchmarking
.. code-block:: shell

Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt

.. code-block:: shell
For example, use this command to run the performance benchmark test on the Llama 3.1 8B model
using one GPU with the float16 data type on the host machine.

git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
.. code-block:: shell

{% for model_group in model_groups %}
{% for model in model_group.models %}
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags pyt_train_llama-3.1-8b --keep-model-dir --live-output --timeout 28800

.. container:: model-doc {{ model.mad_tag }}
The available models for MAD-integrated benchmarking are:

For example, use this command to run the performance benchmark test on the {{ model.model }} model
using one GPU with the {{ model.precision }} data type on the host machine.
* ``pyt_train_llama-3.3-70b``

.. code-block:: shell
* ``pyt_train_llama-3.1-8b``

export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{ model.mad_tag }} --keep-model-dir --live-output --timeout 28800
* ``pyt_train_llama-3.1-70b``

MAD launches a Docker container with the name
``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/perf.csv``.
* ``pyt_train_flux``

{% endfor %}
{% endfor %}
MAD launches a Docker container with the name
``container_ci-pyt_train_llama-3.1-8b``, for example. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/perf.csv``.

.. tab-item:: Standalone benchmarking
.. tab-item:: Standalone benchmarking

.. rubric:: Download the Docker image and required packages
.. rubric:: Download the Docker image and required packages

Use the following command to pull the Docker image from Docker Hub.
Use the following command to pull the Docker image from Docker Hub.

.. code-block:: shell
.. code-block:: shell

docker pull {{ unified_docker.pull_tag }}
docker pull rocm/pytorch-training:v25.5

Run the Docker container.
Run the Docker container.

.. code-block:: shell
.. code-block:: shell

docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env rocm/pytorch-training:v25.5

Use these commands if you exit the ``training_env`` container and need to return to it.
Use these commands if you exit the ``training_env`` container and need to return to it.

.. code-block:: shell
.. code-block:: shell

docker start training_env
docker exec -it training_env bash
docker start training_env
docker exec -it training_env bash

In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.
In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
repository and navigate to the benchmark scripts directory
``/workspace/MAD/scripts/pytorch_train``.

.. code-block:: shell
.. code-block:: shell

git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train
git clone https://github.com/ROCm/MAD
cd MAD/scripts/pytorch_train

.. rubric:: Prepare training datasets and dependencies
.. rubric:: Prepare training datasets and dependencies

The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.
The following benchmarking examples require downloading models and datasets
from Hugging Face. To ensure successful access to gated repos, set your
``HF_TOKEN``.

.. code-block:: shell
.. code-block:: shell

export HF_TOKEN=$your_personal_hugging_face_access_token
export HF_TOKEN=$your_personal_hugging_face_access_token

Run the setup script to install libraries and datasets needed for benchmarking.
Run the setup script to install libraries and datasets needed for benchmarking.

.. code-block:: shell
.. code-block:: shell

./pytorch_benchmark_setup.sh
./pytorch_benchmark_setup.sh

.. container:: model-doc pyt_train_llama-3.1-8b
``pytorch_benchmark_setup.sh`` installs the following libraries:

``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
.. list-table::
:header-rows: 1

.. list-table::
:header-rows: 1
* - Library
- Benchmark model
- Reference

* - Library
- Reference
* - ``accelerate``
- Llama 3.1 8B, FLUX
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``datasets``
- Llama 3.1 8B, 70B, FLUX
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``torchdata``
- Llama 3.1 70B
- `TorchData <https://pytorch.org/data/beta/index.html>`_

.. container:: model-doc pyt_train_llama-3.1-70b
* - ``tomli``
- Llama 3.1 70B
- `Tomli <https://pypi.org/project/tomli/>`_

``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
* - ``tiktoken``
- Llama 3.1 70B
- `tiktoken <https://github.com/openai/tiktoken>`_

.. list-table::
:header-rows: 1
* - ``blobfile``
- Llama 3.1 70B
- `blobfile <https://pypi.org/project/blobfile/>`_

* - Library
- Reference
* - ``tabulate``
- Llama 3.1 70B
- `tabulate <https://pypi.org/project/tabulate/>`_

* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
* - ``wandb``
- Llama 3.1 70B
- `Weights & Biases <https://github.com/wandb/wandb>`_

* - ``torchdata``
- `TorchData <https://pytorch.org/data/beta/index.html>`_
* - ``sentencepiece``
- Llama 3.1 70B, FLUX
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

* - ``tomli``
- `Tomli <https://pypi.org/project/tomli/>`_
* - ``tensorboard``
- Llama 3.1 70B, FLUX
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

* - ``tiktoken``
- `tiktoken <https://github.com/openai/tiktoken>`_
* - ``csvkit``
- FLUX
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1

* - ``blobfile``
- `blobfile <https://pypi.org/project/blobfile/>`_
* - ``deepspeed``
- FLUX
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2

* - ``tabulate``
- `tabulate <https://pypi.org/project/tabulate/>`_
* - ``diffusers``
- FLUX
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0

* - ``wandb``
- `Weights & Biases <https://github.com/wandb/wandb>`_
* - ``GitPython``
- FLUX
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44

* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* - ``opencv-python-headless``
- FLUX
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84

* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* - ``peft``
- FLUX
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0

.. container:: model-doc pyt_train_flux
* - ``protobuf``
- FLUX
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2

``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
* - ``pytest``
- FLUX
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4

.. list-table::
:header-rows: 1
* - ``python-dotenv``
- FLUX
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1

* - Library
- Reference
* - ``seaborn``
- FLUX
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2

* - ``accelerate``
- `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
* - ``transformers``
- FLUX
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0

* - ``datasets``
- `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
``pytorch_benchmark_setup.sh`` downloads the following models from Hugging Face:

* - ``sentencepiece``
- `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
* `meta-llama/Llama-3.1-70B-Instruct <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_

* - ``tensorboard``
- `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
* `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_

* - ``csvkit``
- `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
Along with the following datasets:

* - ``deepspeed``
- `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
* `WikiText <https://huggingface.co/datasets/Salesforce/wikitext>`_

* - ``diffusers``
- `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
* `UltraChat 200k <https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k>`_

* - ``GitPython``
- `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_

* - ``opencv-python-headless``
- `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
.. rubric:: Pretraining

* - ``peft``
- `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
To start the pretraining benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.

* - ``protobuf``
- `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
.. code-block:: shell

* - ``pytest``
- `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
./pytorch_benchmark_report.sh -t $training_mode -m $model_repo -p $datatype -s $sequence_length

* - ``python-dotenv``
- `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
.. list-table::
:header-rows: 1

* - ``seaborn``
- `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
* - Name
- Options
- Description

* - ``transformers``
- `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
* - ``$training_mode``
- ``pretrain``
- Benchmark pretraining

``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
* -
- ``finetune_fw``
- Benchmark full weight fine-tuning (Llama 3.1 70B with BF16)

* `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
* -
- ``finetune_lora``
- Benchmark LoRA fine-tuning (Llama 3.1 70B with BF16)

{% for model_group in model_groups %}
{% for model in model_group.models %}
{% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
* -
- ``HF_finetune_lora``
- Benchmark LoRA fine-tuning with Hugging Face PEFT (Llama 2 70B with BF16)

.. container:: model-doc {{ model.mad_tag }}
* - ``$datatype``
- ``FP8`` or ``BF16``
- Only Llama 3.1 8B supports FP8 precision.

.. rubric:: Pretraining
* - ``$model_repo``
- ``Llama-3.3-70B``
- `Llama 3.3 70B <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`_

To start the pre-training benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
* -
- ``Llama-3.1-8B``
- `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct>`_

.. code-block:: shell
* -
- ``Llama-3.1-70B``
- `Llama 3.1 70B <https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct>`_

./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
* -
- ``Llama-2-70B``
- `Llama 2 70B <https://huggingface.co/meta-llama/Llama-2-70B>`_

.. list-table::
:header-rows: 1
* -
- ``Flux``
- `FLUX.1 [dev] <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_

* - Name
- Options
- Description
* - ``$sequence_length``
- Sequence length for the language model.
- Between 2048 and 8192. 8192 by default.

{% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
* - ``$datatype``
- ``BF16`` or ``FP8``
- Only Llama 3.1 8B supports FP8 precision.
{% else %}
* - ``$datatype``
- ``BF16``
- Only Llama 3.1 8B supports FP8 precision.
{% endif %}
.. note::

* - ``$sequence_length``
- Sequence length for the language model.
- Between 2048 and 8192. 8192 by default.
Occasionally, downloading the Flux dataset might fail. In the event of this
error, manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.

{% if model.mad_tag == "pyt_train_flux" %}
.. container:: model-doc {{ model.mad_tag }}
.. rubric:: Fine-tuning

.. note::
To start the fine-tuning benchmark, use the following command. It will run the benchmarking example of Llama 3.1 70B
with the WikiText dataset using the AMD fork of `torchtune <https://github.com/AMD-AIG-AIMA/torchtune>`_.

Occasionally, downloading the Flux dataset might fail. In the event of this
error, manually download it from Hugging Face at
`black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
the required dataset.
{% endif %}
{% endif %}
.. code-block:: shell

{% if model_group.tag == "fine-tuning" %}
.. container:: model-doc {{ model.mad_tag }}
./pytorch_benchmark_report.sh -t {finetune_fw, finetune_lora} -p BF16 -m Llama-3.1-70B

.. rubric:: Fine-tuning
Use the following command to run the benchmarking example of Llama 2 70B with the UltraChat 200k dataset using
`Hugging Face PEFT <https://huggingface.co/docs/peft/en/index>`_.

To start the fine-tuning benchmark, use the following command with the
appropriate options. See the following list of options and their descriptions.
.. code-block:: shell

.. code-block:: shell
./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
.. rubric:: Benchmarking examples

.. list-table::
:header-rows: 1
Here are some example commands to get started pretraining and fine-tuning with various model configurations.

* - Name
- Options
- Description
* Example 1: Llama 3.1 70B with BF16 precision with `torchtitan <https://github.com/ROCm/torchtitan>`_.

* - ``$training_mode``
- ``finetune_fw``
- Full weight fine-tuning (BF16 supported)
.. code-block:: shell

* -
- ``finetune_lora``
- LoRA fine-tuning (BF16 supported)
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Llama-3.1-70B -s 8192

* -
- ``finetune_qlora``
- QLoRA fine-tuning (BF16 supported)
* Example 2: Llama 3.1 8B with FP8 precision using Transformer Engine (TE) and Hugging Face Accelerate.

* -
- ``HF_finetune_lora``
- LoRA fine-tuning with Hugging Face PEFT
.. code-block:: shell

* - ``$datatype``
- ``BF16``
- All models support BF16.
./pytorch_benchmark_report.sh -t pretrain -p FP8 -m Llama-3.1-8B -s 8192

* - ``$sequence_length``
- Between 2048 and 16384.
- Sequence length for the language model.
* Example 3: FLUX.1-dev with BF16 precision with FluxBenchmark.

.. note::
.. code-block:: shell

{{ model.model }} currently supports the following fine-tuning methods:
./pytorch_benchmark_report.sh -t pretrain -p BF16 -m Flux

{% for method in model.training_modes %}
* ``{{ method }}``
{% endfor %}
{% if model.training_modes|length < 4 %}
* Example 4: Torchtune full weight fine-tuning with Llama 3.1 70B

The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
does not currently provide YAML configuration files for other combinations of
model and fine-tuning method.
However, you can still configure your own YAML files to enable support for
fine-tuning methods not listed here by following existing patterns in the
``/workspace/torchtune/recipes/configs`` directory.
{% endif %}
{% endif %}
{% endfor %}
{% endfor %}
.. code-block:: shell

.. rubric:: Benchmarking examples
./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.1-70B

For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
* Example 5: Torchtune LoRA fine-tuning with Llama 3.1 70B

.. code-block:: shell

./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.1-70B

* Example 6: Torchtune full weight fine-tuning with Llama-3.3-70B

.. code-block:: shell

./pytorch_benchmark_report.sh -t finetune_fw -p BF16 -m Llama-3.3-70B

* Example 7: Torchtune LoRA fine-tuning with Llama-3.3-70B

.. code-block:: shell

./pytorch_benchmark_report.sh -t finetune_lora -p BF16 -m Llama-3.3-70B

* Example 8: Torchtune QLoRA fine-tuning with Llama-3.3-70B

.. code-block:: shell

./pytorch_benchmark_report.sh -t finetune_qlora -p BF16 -m Llama-3.3-70B

* Example 9: Hugging Face PEFT LoRA fine-tuning with Llama 2 70B

.. code-block:: shell

./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

Previous versions
=================

See :doc:`previous-versions/pytorch-training-history` to find documentation for previous releases
of the ``ROCm/pytorch-training`` Docker image.
This table lists previous versions of the ROCm PyTorch training Docker image for training
performance validation. For detailed information about available models for
benchmarking, see the version-specific documentation.

.. list-table::
:header-rows: 1
:stub-columns: 1

* - Image version
- ROCm version
- PyTorch version
- Resources

* - v25.4
- 6.3.0
- 2.7.0a0+git637433
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.3/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.4/images/sha256-fa98a9aa69968e654466c06f05aaa12730db79b48b113c1ab4f7a5fe6920a20b>`_

* - v25.3
- 6.3.0
- 2.7.0a0+git637433
-
* `Documentation <https://rocm.docs.amd.com/en/docs-6.3.2/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.html>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.3/images/sha256-0ffdde1b590fd2787b1c7adf5686875b100980b0f314090901387c44253e709b>`_

@@ -38,5 +38,5 @@ The variable parsing stops when a syntax error occurs. The erroneous set and the

These environment variables only affect ROCm software, not graphics applications.

Not all CU configurations are valid on all devices. For example, on devices where two CUs can be combined into a WGP (for kernels running in WGP mode), it’s not valid to disable only a single CU in a WGP.
Not all CU configurations are valid on all devices. For example, on devices where two CUs can be combined into a WGP (for kernels running in WGP mode), it’s not valid to disable only a single CU in a WGP. For more information about what to expect when disabling CUs, see the `Exploring AMD GPU Scheduling Details by Experimenting With “Worst Practices” <https://www.cs.unc.edu/~otternes/papers/rtsj2022.pdf>`_ paper.
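
As a hedged illustration of the mask syntax such a variable takes (assuming the
``HSA_CU_MASK`` variable with a ``<device index>:<CU list>`` format, restricting
device 0 to its first eight CUs; invalid combinations are rejected per the
parsing rules above):

.. code-block:: shell

   # Hypothetical example: mask device 0 down to CUs 0 through 7
   export HSA_CU_MASK=0:0-7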


@@ -1,26 +0,0 @@
---
myst:
  html_meta:
    "description": "AMD ROCm 7.0 Alpha documentation"
    "keywords": "Radeon, open, compute, platform, install, how, conceptual, reference, home, docs"
---

# AMD ROCm 7.0 Alpha documentation

AMD ROCm is an open-source software platform optimized to extract HPC and AI
workload performance from AMD Instinct™ accelerators while maintaining
compatibility with industry software frameworks.

This documentation provides early access information about the ROCm software
Alpha release, which previews new features under development so users can test
them and provide feedback.
It is not recommended for production use.

```{note}
See [ROCm documentation](https://rocm.docs.amd.com/en/latest/) for the latest stable release for use in production.
```

The documentation includes:

- [ROCm 7.0 Alpha release notes](release.rst) with feature details and support matrix
- [Installation instructions](install/index.rst) for the ROCm 7.0 Alpha and the Instinct Driver
@@ -1,28 +0,0 @@
.. meta::
   :description: Installation via native package manager
   :keywords: ROCm install, installation instructions, package manager, native package manager, AMD,
              ROCm

****************************************
ROCm 7.0 Alpha installation instructions
****************************************

The ROCm 7.0 Alpha must be installed using your Linux distribution's native
package manager. This release supports specific hardware and software
configurations -- before installing, see the :ref:`supported OSes and hardware
<alpha-system-requirements>` outlined in the Alpha release notes.

.. important::

   Upgrades and downgrades are not supported. You must uninstall any existing
   ROCm installation before installing the Alpha build.

.. grid:: 2

   .. grid-item-card:: Install ROCm

      See :doc:`Install the ROCm 7.0 Alpha via package manager <rocm>`.

   .. grid-item-card:: Install Instinct Driver

      See :doc:`Install the Instinct Driver via package manager <instinct-driver>`.
@@ -1,212 +0,0 @@
***********************************************
Install the Instinct Driver via package manager
***********************************************

This section describes how to install the Instinct Driver using ``apt`` on
Ubuntu 22.04 or 24.04, or ``dnf`` on Red Hat Enterprise Linux 9.6.

.. important::

   Upgrades and downgrades are not supported. You must uninstall any existing
   ROCm installation before installing the preview build.

Prerequisites
=============

Before installing, complete the following prerequisites.

.. tab-set::

   .. tab-item:: Ubuntu 22.04
      :sync: ubuntu-22

      Install kernel headers.

      .. code-block:: shell

         sudo apt install "linux-headers-$(uname -r)" "linux-modules-extra-$(uname -r)"

   .. tab-item:: Ubuntu 24.04
      :sync: ubuntu-24

      Install kernel headers.

      .. code-block:: shell

         sudo apt install "linux-headers-$(uname -r)" "linux-modules-extra-$(uname -r)"

   .. tab-item:: RHEL 9.6
      :sync: rhel-96

      1. Register your Enterprise Linux.

         .. code-block:: shell

            subscription-manager register --username <username> --password <password>
            subscription-manager attach --auto

      2. Update your Enterprise Linux.

         .. code-block:: shell

            sudo dnf update --releasever=9.6 --exclude=\*release\*

      3. Install kernel headers.

         .. code-block:: shell

            sudo dnf install "kernel-headers-$(uname -r)" "kernel-devel-$(uname -r)" "kernel-devel-matched-$(uname -r)"

Register ROCm repositories
==========================

.. tab-set::

   .. tab-item:: Ubuntu 22.04
      :sync: ubuntu-22

      1. Add the package signing key.

         .. code-block:: shell

            # Make the directory if it doesn't exist yet.
            # This location is recommended by the distribution maintainers.
            sudo mkdir --parents --mode=0755 /etc/apt/keyrings
            # Download the key, convert the signing-key to a full
            # keyring required by apt and store in the keyring directory.
            wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
                gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null

      2. Register the kernel mode driver.

         .. code-block:: shell

            echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/30.10_alpha/ubuntu jammy main" \
                | sudo tee /etc/apt/sources.list.d/amdgpu.list
            sudo apt update

   .. tab-item:: Ubuntu 24.04
      :sync: ubuntu-24

      1. Add the package signing key.

         .. code-block:: shell

            # Make the directory if it doesn't exist yet.
            # This location is recommended by the distribution maintainers.
            sudo mkdir --parents --mode=0755 /etc/apt/keyrings
            # Download the key, convert the signing-key to a full
            # keyring required by apt and store in the keyring directory.
            wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
                gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null

      2. Register the kernel mode driver.

         .. code-block:: shell

            echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/30.10_alpha/ubuntu noble main" \
                | sudo tee /etc/apt/sources.list.d/amdgpu.list
            sudo apt update

   .. tab-item:: RHEL 9.6
      :sync: rhel-96

      .. code-block:: shell

         sudo tee /etc/yum.repos.d/amdgpu.repo <<EOF
         [amdgpu]
         name=amdgpu
         baseurl=https://repo.radeon.com/amdgpu/30.10_alpha/rhel/9.6/main/x86_64/
         enabled=1
         priority=50
         gpgcheck=1
         gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
         EOF
         sudo dnf clean all

Install the kernel driver
=========================

.. tab-set::

   .. tab-item:: Ubuntu 22.04
      :sync: ubuntu-22

      .. code-block:: shell

         sudo apt install amdgpu-dkms

   .. tab-item:: Ubuntu 24.04
      :sync: ubuntu-24

      .. code-block:: shell

         sudo apt install amdgpu-dkms

   .. tab-item:: RHEL 9.6
      :sync: rhel-96

      .. code-block:: shell

         sudo dnf install amdgpu-dkms
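
A reboot is typically required before the new kernel module is in use. As a quick
sanity check afterwards (standard Linux tooling, offered as a suggestion rather
than part of the official procedure):

.. code-block:: shell

   # Confirm the DKMS module is installed and the amdgpu module is loaded
   dkms status
   lsmod | grep amdgpu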

Uninstalling
============

.. tab-set::

   .. tab-item:: Ubuntu 22.04
      :sync: ubuntu-22

      1. Uninstall the kernel mode driver.

         .. code-block:: shell

            sudo apt autoremove amdgpu-dkms

      2. Remove AMDGPU repositories.

         .. code-block:: shell

            sudo rm /etc/apt/sources.list.d/amdgpu.list
            # Clear the cache and clean the system
            sudo rm -rf /var/cache/apt/*
            sudo apt clean all
            sudo apt update

   .. tab-item:: Ubuntu 24.04
      :sync: ubuntu-24

      1. Uninstall the kernel mode driver.

         .. code-block:: shell

            sudo apt autoremove amdgpu-dkms

      2. Remove AMDGPU repositories.

         .. code-block:: shell

            sudo rm /etc/apt/sources.list.d/amdgpu.list
            # Clear the cache and clean the system
            sudo rm -rf /var/cache/apt/*
            sudo apt clean all
            sudo apt update

   .. tab-item:: RHEL 9.6
      :sync: rhel-96

      1. Uninstall the kernel mode driver.

         .. code-block:: shell

            sudo dnf remove amdgpu-dkms

      2. Remove AMDGPU repositories.

         .. code-block:: shell

            sudo rm /etc/yum.repos.d/amdgpu.repo
            # Clear the cache and clean the system
            sudo rm -rf /var/cache/dnf
            sudo dnf clean all
||||
@@ -1,288 +0,0 @@
|
||||
**********************************************
|
||||
Install the ROCm 7.0 Alpha via package manager
|
||||
**********************************************
|
||||
|
||||
This page describes how to install the ROCm 7.0 Alpha build using ``apt`` on
|
||||
Ubuntu 22.04 or 24.04, or ``dnf`` on Red Hat Enterprise Linux 9.6.
|
||||
|
||||
.. important::
|
||||
|
||||
Upgrades and downgrades are not supported. You must uninstall any existing
|
||||
ROCm installation before installing the preview build.
|
||||
|
||||
Prerequisites
|
||||
=============
|
||||
|
||||
Before installing, complete the following prerequisites.
|
||||
|
||||
.. tab-set::
|
||||
|
||||
.. tab-item:: Ubuntu 22.04
|
||||
:sync: ubuntu-22
|
||||
|
||||
1. Install development packages.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo apt install python3-setuptools python3-wheel
|
||||
|
||||
2. Configure user permissions for GPU access.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo usermod -a -G render,video $LOGNAME
|
||||
|
||||
.. tab-item:: Ubuntu 24.04
|
||||
:sync: ubuntu-24
|
||||
|
||||
1. Install development packages.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo apt install python3-setuptools python3-wheel
|
||||
|
||||
2. Configure user permissions for GPU access.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo usermod -a -G render,video $LOGNAME
|
||||
|
||||
.. tab-item:: RHEL 9.6
|
||||
:sync: rhel-96
|
||||
|
||||
1. Register your Enterprise Linux.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
subscription-manager register --username <username> --password <password>
|
||||
subscription-manager attach --auto
|
||||
|
||||
2. Update your Enterprise Linux.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo dnf update --releasever=9.6 --exclude=\*release\*
|
||||
|
||||
3. Install additional package repositories.
|
||||
|
||||
Add the EPEL repository:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
wget https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
|
||||
sudo rpm -ivh epel-release-latest-9.noarch.rpm
|
||||
|
||||
Enable the CodeReady Linux Build (CRB) repository.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo dnf install dnf-plugin-config-manager
|
||||
sudo crb enable
|
||||
|
||||
4. Install development packages.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo dnf install python3-setuptools python3-wheel
|
||||
|
||||
5. Configure user permissions for GPU access.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
sudo usermod -a -G render,video $LOGNAME

Register ROCm repositories
==========================

.. tab-set::

    .. tab-item:: Ubuntu 22.04
        :sync: ubuntu-22

        1. Add the package signing key.

           .. code-block:: shell

              # Make the directory if it doesn't exist yet.
              # This location is recommended by the distribution maintainers.
              sudo mkdir --parents --mode=0755 /etc/apt/keyrings

              # Download the key, convert the signing key to a full
              # keyring required by apt, and store it in the keyring directory.
              wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
                  gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null

        2. Register ROCm packages.

           .. code-block:: shell

              echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.0_alpha jammy main" \
                  | sudo tee /etc/apt/sources.list.d/rocm.list

              echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/graphics/7.0_alpha/ubuntu jammy main" \
                  | sudo tee /etc/apt/sources.list.d/rocm-graphics.list

              echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
                  | sudo tee /etc/apt/preferences.d/rocm-pin-600

              sudo apt update

    .. tab-item:: Ubuntu 24.04
        :sync: ubuntu-24

        1. Add the package signing key.

           .. code-block:: shell

              # Make the directory if it doesn't exist yet.
              # This location is recommended by the distribution maintainers.
              sudo mkdir --parents --mode=0755 /etc/apt/keyrings

              # Download the key, convert the signing key to a full
              # keyring required by apt, and store it in the keyring directory.
              wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
                  gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null

        2. Register ROCm packages.

           .. code-block:: shell

              echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.0_alpha noble main" \
                  | sudo tee /etc/apt/sources.list.d/rocm.list

              echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/graphics/7.0_alpha/ubuntu noble main" \
                  | sudo tee /etc/apt/sources.list.d/rocm-graphics.list

              echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
                  | sudo tee /etc/apt/preferences.d/rocm-pin-600

              sudo apt update

    .. tab-item:: RHEL 9.6
        :sync: rhel-96

        .. code-block:: shell

           sudo tee /etc/yum.repos.d/rocm.repo <<EOF
           [ROCm-7.0.0]
           name=ROCm7.0.0
           baseurl=https://repo.radeon.com/rocm/el9/7.0_alpha/main
           enabled=1
           priority=50
           gpgcheck=1
           gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
           EOF

           sudo tee /etc/yum.repos.d/rocm-graphics.repo <<EOF
           [ROCm-7.0.0-Graphics]
           name=ROCm7.0.0-Graphics
           baseurl=https://repo.radeon.com/graphics/7.0_alpha/rhel/9/main/x86_64/
           enabled=1
           priority=50
           gpgcheck=1
           gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
           EOF

           sudo dnf clean all
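
Before installing, you can optionally confirm that the repositories
registered correctly. A quick check (the repository names and the 600 pin
priority come from the configuration above):

.. code-block:: shell

   # Ubuntu: repo.radeon.com entries should appear, pinned at priority 600.
   apt-cache policy | grep repo.radeon.com

   # RHEL: both ROCm repository IDs should be listed.
   dnf repolist | grep -i rocm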

Install ROCm
============

.. tab-set::

    .. tab-item:: Ubuntu 22.04
        :sync: ubuntu-22

        .. code-block:: shell

           sudo apt install rocm

    .. tab-item:: Ubuntu 24.04
        :sync: ubuntu-24

        .. code-block:: shell

           sudo apt install rocm

    .. tab-item:: RHEL 9.6
        :sync: rhel-96

        .. code-block:: shell

           sudo dnf install rocm
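
Once the install finishes, a common smoke test is to ask the runtime which
agents it can see. ``rocminfo`` is part of the ROCm stack; the
``/opt/rocm`` prefix below assumes the default install location:

.. code-block:: shell

   # Lists the host CPU and any detected GPUs as HSA agents.
   # A missing GPU entry usually points to a driver or
   # permissions problem (see the prerequisites above).
   /opt/rocm/bin/rocminfo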

.. _uninstall-rocm:

Uninstalling
============

.. tab-set::

    .. tab-item:: Ubuntu 22.04
        :sync: ubuntu-22

        1. Uninstall specific meta packages.

           .. code-block:: shell

              sudo apt autoremove rocm

        2. Uninstall ROCm packages.

           .. code-block:: shell

              sudo apt autoremove rocm-core

        3. Remove ROCm repositories.

           .. code-block:: shell

              sudo rm /etc/apt/sources.list.d/rocm*.list
              # Clear the cache and clean the system.
              sudo rm -rf /var/cache/apt/*
              sudo apt clean
              sudo apt update

    .. tab-item:: Ubuntu 24.04
        :sync: ubuntu-24

        1. Uninstall specific meta packages.

           .. code-block:: shell

              sudo apt autoremove rocm

        2. Uninstall ROCm packages.

           .. code-block:: shell

              sudo apt autoremove rocm-core

        3. Remove ROCm repositories.

           .. code-block:: shell

              sudo rm /etc/apt/sources.list.d/rocm*.list
              # Clear the cache and clean the system.
              sudo rm -rf /var/cache/apt/*
              sudo apt clean
              sudo apt update

    .. tab-item:: RHEL 9.6
        :sync: rhel-96

        1. Uninstall specific meta packages.

           .. code-block:: shell

              sudo dnf remove rocm

        2. Uninstall ROCm packages.

           .. code-block:: shell

              sudo dnf remove rocm-core amdgpu-core

        3. Remove ROCm repositories.

           .. code-block:: shell

              sudo rm /etc/yum.repos.d/rocm*.repo*
              # Clear the cache and clean the system.
              sudo rm -rf /var/cache/dnf
              sudo dnf clean all
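
To confirm the removal was complete before reinstalling, you can check for
leftover packages; no output means the system is clean:

.. code-block:: shell

   # Ubuntu
   dpkg -l | grep -i rocm

   # RHEL
   dnf list installed | grep -i rocm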