update links to vllm perf validation doc

External CI: various fixes (#3963 )
External CI: Add aqlprofile to Tensile test dependencies (#3961 )
2026-01-09 22:58:17 -05:00 · 2024-10-30 15:52:12 -04:00 · 2024-10-30 15:52:12 -04:00 · 2024-10-30 15:52:12 -04:00 · 2024-10-30 14:08:35 -04:00 · 2024-10-30 12:51:40 -04:00
92 changed files with 1576 additions and 603 deletions
--- a/.azuredevops/ci-builds/aomp-mainline.yml
+++ b/.azuredevops/ci-builds/aomp-mainline.yml
@@ -0,0 +1,42 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+resources:  # external repositories and the upstream pipeline this definition consumes
+  repositories:  # every repository below is pinned to the amd-mainline-open ref
+  - repository: aomp_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/aomp
+    ref: amd-mainline-open
+  - repository: aomp-extras_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/aomp-extras
+    ref: amd-mainline-open
+  - repository: flang_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/flang
+    ref: amd-mainline-open
+  - repository: llvm-project_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/llvm-project
+    ref: amd-mainline-open
+  pipelines:
+  - pipeline: rocr-runtime_pipeline
+    source: \ROCR-Runtime
+    trigger:  # pipeline-completion trigger, limited to the branches listed below
+      branches:
+        include:
+        - amd-master
+# this job will only be triggered after successful build sequence of llvm-project and ROCR-Runtime
+
+trigger: none  # no CI runs on pushes
+pr: none  # no pull-request validation runs
+
+jobs:
+  - template: ${{ variables.CI_COMPONENT_PATH }}/aomp.yml
+    parameters:
+      checkoutRepo: aomp_repo  # alias of the ROCm/aomp repository resource declared above
--- a/.azuredevops/ci-builds/aomp-staging.yml
+++ b/.azuredevops/ci-builds/aomp-staging.yml
@@ -27,7 +27,10 @@ resources:
  pipelines:
  - pipeline: rocr-runtime_pipeline
    source: \ROCR-Runtime
-    trigger: true
+    trigger:
+      branches:
+        include:
+        - amd-staging
 # this job will only be triggered after successful build sequence of llvm-project and ROCR-Runtime

 trigger: none
--- a/.azuredevops/components/AMDMIGraphX.yml
+++ b/.azuredevops/components/AMDMIGraphX.yml
@@ -27,12 +27,12 @@ parameters:
  type: object
  default:
    - https://github.com/RadeonOpenCompute/rbuild/archive/master.tar.gz
-    - onnx==1.14.1
-    - numpy==1.21.6
-    - typing==3.7.4
-    - pytest==6.0.1
-    - packaging==23.0
-    - protobuf==3.20.2
+    - onnx>=1.14.1
+    - numpy>=1.21.6
+    - typing>=3.7.4
+    - pytest>=6.0.1
+    - packaging>=23.0
+    - protobuf>=3.20.2
 - name: rocmDependencies
  type: object
  default:
@@ -84,8 +84,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -129,6 +127,8 @@ jobs:
      gpuTarget: $(JOB_GPU_TARGET)

 - job: AMDMIGraphX_testing
+  dependsOn: AMDMIGraphX
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -188,9 +188,10 @@ jobs:
        -DBUILD_TESTING=ON
        -DMIGRAPHX_ENABLE_C_API_TEST=ON
        ..
-  - task: Bash@3
-    displayName: Build and run MIGraphX tests
-    inputs:
-      targetType: inline
-      workingDirectory: build
-      script: make -j$(nproc) check
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+    parameters:
+      componentName: AMDMIGraphX
+      testExecutable: make
+      testParameters: -j$(nproc) check
+      testPublishResults: false
--- a/.azuredevops/components/MIOpen.yml
+++ b/.azuredevops/components/MIOpen.yml
@@ -16,6 +16,7 @@ parameters:
    - libgtest-dev
    - libsqlite3-dev
    - libstdc++-12-dev
+    - libzstd-dev
    - ninja-build
    - nlohmann-json3-dev
    - python3-pip
@@ -74,26 +75,17 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
      pipModules: ${{ parameters.pipModules }}
-  # The default boost library from apt is 1.74, which does not satisfy MIOpen's build requirement (1.79+)
-  # Upgrade boost from apt by following https://launchpad.net/~mhier/+archive/ubuntu/libboost-latest
-  - task: Bash@3
-    displayName: 'Install Boost 1.83'
-    inputs:
-      targetType: inline
-      script: |
-        sudo add-apt-repository ppa:mhier/libboost-latest -y
-        sudo apt-get --yes install libboost1.83-dev libboost-system1.83-dev libboost-filesystem1.83-dev
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
+  # The default boost library from apt is 1.74, which does not satisfy MIOpen's build requirement (1.79+)
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-boost.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
    parameters:
      dependencyList: ${{ parameters.rocmDependencies }}
@@ -109,7 +101,8 @@ jobs:
      extraBuildFlags: >-
        -DMIOPEN_BACKEND=HIP
        -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/boost
+        -DAMDGPU_TARGETS=$(JOB_GPU_TARGET)
        -DMIOPEN_ENABLE_AI_KERNEL_TUNING=OFF
        -DMIOPEN_ENABLE_AI_IMMED_MODE_FALLBACK=OFF
        -DCMAKE_BUILD_TYPE=Release
@@ -120,6 +113,8 @@ jobs:
      gpuTarget: $(JOB_GPU_TARGET)

 - job: MIOpen_testing
+  dependsOn: MIOpen
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -136,17 +131,12 @@ jobs:
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
      pipModules: ${{ parameters.pipModules }}
-  - task: Bash@3
-    displayName: 'Install Boost 1.83'
-    inputs:
-      targetType: inline
-      script: |
-        sudo add-apt-repository ppa:mhier/libboost-latest -y
-        sudo apt-get --yes install libboost1.83-dev libboost-system1.83-dev libboost-filesystem1.83-dev
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
+  # The default boost library from apt is 1.74, which does not satisfy MIOpen's build requirement (1.79+)
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-boost.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    parameters:
      ${{ if eq(parameters.checkoutRef, '') }}:
@@ -198,7 +188,7 @@ jobs:
    displayName: 'MIOpen Test CMake Flags'
    inputs:
      cmakeArgs: >-
-        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Build.SourcesDirectory)/bin;$(Build.SourcesDirectory)/cget/cget/pkg/Dobiasd__FunctionalPlus/install
+        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Build.SourcesDirectory)/bin;$(Build.SourcesDirectory)/cget/cget/pkg/Dobiasd__FunctionalPlus/install;$(Agent.BuildDirectory)/boost
        -DCMAKE_INSTALL_PREFIX=$(Agent.BuildDirectory)/rocm
        -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
        -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
@@ -209,19 +199,15 @@ jobs:
        -DMIOPEN_USE_MLIR=ON
        -DMIOPEN_GPU_SYNC=OFF
        ..
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-    parameters:
-      componentName: MIOpen
-      testExecutable: 'CTEST_PARALLEL_LEVEL=4 make -j$(nproc) check'
-      testParameters: ''
-      reloadAMDGPU: true
-      testPublishResults: false
  - task: Bash@3
-    condition: always()
-    displayName: Clean up Boost 1.83
+    displayName: 'MIOpen Test Build'
    inputs:
      targetType: inline
      script: |
-        sudo apt -y autoremove libboost1.83-dev libboost-system1.83-dev libboost-filesystem1.83-dev
-        sudo add-apt-repository --remove ppa:mhier/libboost-latest -y
-        sudo apt update
+        cmake --build . --target tests -- -j$(nproc)
+      workingDirectory: $(Build.SourcesDirectory)/build
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+    parameters:
+      componentName: MIOpen
+      reloadAMDGPU: true
--- a/.azuredevops/components/MIVisionX.yml
+++ b/.azuredevops/components/MIVisionX.yml
@@ -86,8 +86,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -120,7 +118,7 @@ jobs:

 - job: MIVisionX_testing
  dependsOn: MIVisionX
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -162,6 +160,7 @@ jobs:
      mkdir mivisionx-tests
      cd mivisionx-tests
      cmake /opt/rocm/share/mivisionx/test
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: MIVisionX
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -59,7 +59,7 @@ jobs:

 - job: ROCR_Runtime_testing
  dependsOn: ROCR_Runtime
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -102,8 +102,12 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
      checkoutRepo: ${{ parameters.checkoutRepo }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+    parameters:
+      runRocminfo: false
  - task: Bash@3
    displayName: Build kfdtest
+    continueOnError: true
    inputs:
      targetType: 'inline'
      workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest
@@ -119,6 +123,7 @@ jobs:
      testDir: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/scripts
  - task: Bash@3
    displayName: Build rdmatest app
+    continueOnError: true
    inputs:
      targetType: 'inline'
      workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/rdma/simple/app
@@ -127,6 +132,7 @@ jobs:
        cmake --build .
  - task: Bash@3
    displayName: Build rdmatest driver
+    continueOnError: true
    inputs:
      targetType: 'inline'
      workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/rdma/simple/drv
@@ -136,6 +142,7 @@ jobs:
        RDMA_HEADER_DIR=/usr/src/amdgpu-*/include make all
  - task: Bash@3
    displayName: Install rdmatest driver
+    continueOnError: true
    inputs:
      targetType: 'inline'
      workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/rdma/simple/drv
@@ -151,6 +158,7 @@ jobs:
      testPublishResults: false
  - task: Bash@3
    displayName: Build rocrtst
+    continueOnError: true
    inputs:
      targetType: 'inline'
      workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common
--- a/.azuredevops/components/ROCgdb.yml
+++ b/.azuredevops/components/ROCgdb.yml
@@ -32,6 +32,7 @@ parameters:

 jobs:
 - job: ROCgdb
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -100,6 +101,7 @@ jobs:
        sudo rm -rf /opt/rocm
        sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
        echo "##vso[task.prependpath]/opt/rocm/bin"
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - task: Bash@3
    displayName: check-gdb
    continueOnError: true
--- a/.azuredevops/components/ROCmValidationSuite.yml
+++ b/.azuredevops/components/ROCmValidationSuite.yml
@@ -13,6 +13,7 @@ parameters:
    - libyaml-cpp-dev
    - libpci-dev
    - libpci3
+    - libgst-dev
    - libgtest-dev
    - git
 - name: rocmDependencies
@@ -40,6 +41,7 @@ parameters:
    - llvm-project
    - rocBLAS
    - rocm_smi_lib
+    - rocminfo
    - rocprofiler-register
    - ROCR-Runtime
    - rocRAND
@@ -64,8 +66,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -98,7 +98,7 @@ jobs:

 - job: ROCmValidationSuite_testing
  dependsOn: ROCmValidationSuite
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -132,6 +132,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: ROCmValidationSuite
--- a/.azuredevops/components/Tensile.yml
+++ b/.azuredevops/components/Tensile.yml
@@ -0,0 +1,175 @@
+parameters:  # template inputs shared by the Tensile build and test jobs
+- name: checkoutRepo
+  type: string
+  default: 'self'
+- name: checkoutRef
+  type: string
+  default: ''  # empty selects "staging" dependencies; non-empty selects "tag-builds"
+- name: aptPackages  # forwarded to steps/dependencies-other.yml in both jobs
+  type: object
+  default:
+    - python3-pip
+    - cmake
+    - libmsgpack-dev
+    - libboost-program-options-dev
+- name: pipModules  # forwarded to steps/dependencies-other.yml in both jobs
+  type: object
+  default:
+    - tox
+    - pytest
+- name: rocmDependencies  # forwarded to steps/dependencies-rocm.yml as dependencyList
+  type: object
+  default:
+    - aomp
+    - clr
+    - llvm-project
+    - rocm-cmake
+    - rocm-core
+    - rocminfo
+    - rocm_smi_lib
+    - rocprofiler-register
+    - ROCR-Runtime
+
+jobs:
+- job: Tensile  # builds the Tensile wheel and publishes it as a pipeline artifact
+  variables:
+  - group: common
+  - template: /.azuredevops/variables-global.yml
+  pool:
+    vmImage: ${{ variables.BASE_BUILD_POOL }}
+  workspace:
+    clean: all
+  steps:
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+    parameters:
+      aptPackages: ${{ parameters.aptPackages }}
+      pipModules: ${{ parameters.pipModules }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    parameters:
+      checkoutRepo: ${{ parameters.checkoutRepo }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+    parameters:
+      dependencyList: ${{ parameters.rocmDependencies }}
+      # CI case: download latest default branch build
+      ${{ if eq(parameters.checkoutRef, '') }}:
+        dependencySource: staging
+      # manual build case: triggered by ROCm/ROCm repo
+      ${{ elseif ne(parameters.checkoutRef, '') }}:
+        dependencySource: tag-builds
+  - task: Bash@3
+    displayName: Create wheel file
+    inputs:
+      targetType: inline
+      script: python3 setup.py bdist_wheel
+      workingDirectory: $(Build.SourcesDirectory)
+    env:
+      ROCM_PATH: $(Agent.BuildDirectory)/rocm  # exported into the script's environment
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
+    parameters:
+      sourceDir: $(Build.SourcesDirectory)/dist
+      contentsString: '*.whl'  # only wheel files are selected from sourceDir
+      targetDir: $(Build.ArtifactStagingDirectory)
+      clean: false
+  - task: PublishPipelineArtifact@1
+    displayName: 'wheel file Publish'
+    retryCountOnTaskFailure: 3  # retry the publish up to three times on failure
+    inputs:
+      targetPath: $(Build.ArtifactStagingDirectory)
+
+- job: Tensile_testing  # installs the wheel built by the Tensile job and runs its tests on a GPU pool
+  dependsOn: Tensile
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))  # skip unless the build succeeded, gfx942 tests are enabled, and this pipeline's name is not in the disabled list
+  variables:
+  - group: common
+  - template: /.azuredevops/variables-global.yml
+  pool: $(JOB_TEST_POOL)  # set per matrix entry below
+  workspace:
+    clean: all
+  strategy:
+    matrix:
+      gfx942:
+        JOB_GPU_TARGET: gfx942
+        JOB_TEST_POOL: ${{ variables.GFX942_TEST_POOL }}
+  steps:
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+    parameters:
+      aptPackages: ${{ parameters.aptPackages }}
+      pipModules: ${{ parameters.pipModules }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+  - task: DownloadPipelineArtifact@2
+    displayName: 'Download Pipeline Wheel Files'
+    inputs:
+      itemPattern: '**/*.whl'  # fetch only wheel files from the artifacts
+      targetPath: $(Agent.BuildDirectory)
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    parameters:
+      checkoutRepo: ${{ parameters.checkoutRepo }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+    parameters:
+      ${{ if eq(parameters.checkoutRef, '') }}:
+        dependencySource: staging
+      ${{ elseif ne(parameters.checkoutRef, '') }}:
+        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+    parameters:
+      dependencyList: ${{ parameters.rocmDependencies }}
+      gpuTarget: $(JOB_GPU_TARGET)
+      ${{ if eq(parameters.checkoutRef, '') }}:
+        dependencySource: staging
+      ${{ elseif ne(parameters.checkoutRef, '') }}:
+        dependencySource: tag-builds
+  - task: Bash@3  # installs every wheel found beneath the working directory
+    displayName: pip install
+    inputs:
+      targetType: inline  # the glob is quoted so the shell hands it to find unexpanded; "." makes the search root explicit
+      script: find . -name '*.whl' -exec pip install {} \;
+      workingDirectory: $(Agent.BuildDirectory)
+  - task: Bash@3
+    displayName: Setup test environment  # replaces /opt/rocm with a symlink to the downloaded ROCm tree
+    inputs:
+      targetType: inline
+      script: |
+        sudo rm -rf /opt/rocm
+        sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
+  - task: Bash@3
+    displayName: Add Python site-packages binaries to path  # prepends <user-base>/bin via the task.prependpath logging command
+    inputs:
+      targetType: inline
+      script: |
+        USER_BASE=$(python3 -m site --user-base)
+        echo "##vso[task.prependpath]$USER_BASE/bin"
+  - task: Bash@3
+    displayName: Add ROCm binaries to PATH  # prepends rocm/bin via task.prependpath
+    inputs:
+      targetType: inline
+      script: echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
+  - task: Bash@3
+    displayName: Add ROCm compilers to PATH  # prepends rocm/llvm/bin via task.prependpath
+    inputs:
+      targetType: inline
+      script: echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+  - task: Bash@3
+    displayName: tox test  # runs the "ci" tox environment; "-m pre_checkin" is passed through after "--"
+    inputs:
+      targetType: inline
+      script: tox run -v -e ci -- -m pre_checkin
+      workingDirectory: $(Build.SourcesDirectory)
+  - task: Bash@3  # each cleanup step wraps PATH in ':' so an entry matches exactly, wherever it sits, then trims the sentinels
+    displayName: Remove Python site-packages binaries from path
+    inputs:
+      targetType: inline  # the first sed expression is double-quoted so the shell expands $USER_BASE
+      script: |
+        USER_BASE=$(python3 -m site --user-base)
+        echo "##vso[task.setvariable variable=PATH]$(echo ":$PATH:" | sed -e "s;:$USER_BASE/bin:;:;" -e 's;^:;;' -e 's;:$;;')"
+  - task: Bash@3
+    displayName: Remove ROCm binaries from PATH
+    inputs:
+      targetType: inline  # $(Agent.BuildDirectory) is an Azure macro, substituted before the script runs
+      script: echo "##vso[task.setvariable variable=PATH]$(echo ":$PATH:" | sed -e 's;:$(Agent.BuildDirectory)/rocm/bin:;:;' -e 's;^:;;' -e 's;:$;;')"
+  - task: Bash@3
+    displayName: Remove ROCm compilers from PATH
+    inputs:
+      targetType: inline  # only the ':' sentinels are trimmed, so the first and last entries keep their slashes
+      script: echo "##vso[task.setvariable variable=PATH]$(echo ":$PATH:" | sed -e 's;:$(Agent.BuildDirectory)/rocm/llvm/bin:;:;' -e 's;^:;;' -e 's;:$;;')"
--- a/.azuredevops/components/amdsmi.yml
+++ b/.azuredevops/components/amdsmi.yml
@@ -35,7 +35,7 @@ jobs:

 - job: amdsmi_testing
  dependsOn: amdsmi
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -53,6 +53,9 @@ jobs:
      aptPackages: ${{ parameters.aptPackages }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+    parameters:
+      runRocminfo: false
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: amdsmi
--- a/.azuredevops/components/aomp.yml
+++ b/.azuredevops/components/aomp.yml
@@ -413,7 +413,7 @@ jobs:

 - job: aomp_testing
  dependsOn: aomp
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -472,6 +472,7 @@ jobs:
      Contents: FileCheck
      TargetFolder: $(Agent.BuildDirectory)/rocm/share/openmp-extras/tests/bin
      retryCount: 3
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - task: Bash@3
    displayName: Test AOMP
    continueOnError: true
--- a/.azuredevops/components/composable_kernel.yml
+++ b/.azuredevops/components/composable_kernel.yml
@@ -48,8 +48,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -101,7 +99,7 @@ jobs:

 - job: composable_kernel_testing
  dependsOn: composable_kernel
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -137,6 +135,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - task: Bash@3
    displayName: Iterate through test scripts
    inputs:
--- a/.azuredevops/components/copyHIP.yml
+++ b/.azuredevops/components/copyHIP.yml
@@ -25,7 +25,7 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-download.yml
    parameters:
      componentName: HIP
-      pipelineId: $(hip-pipeline-id)
+      pipelineId: $(HIP_PIPELINE_ID)
  - template:  ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
    parameters:
      sourceDir: $(Agent.BuildDirectory)/rocm
--- a/.azuredevops/components/hip-tests.yml
+++ b/.azuredevops/components/hip-tests.yml
@@ -52,8 +52,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -91,7 +89,7 @@ jobs:
 - job: hip_tests_testing
  timeoutInMinutes: 240
  dependsOn: hip_tests
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -133,6 +131,7 @@ jobs:
        sudo rm -rf /opt/rocm
        sudo mkdir -p /opt/rocm/bin
        sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rocm_agent_enumerator /opt/rocm/bin/rocm_agent_enumerator
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: hip_tests
--- a/.azuredevops/components/hipBLAS-common.yml
+++ b/.azuredevops/components/hipBLAS-common.yml
@@ -29,7 +29,7 @@ jobs:
  - name: ROCM_PATH
    value: $(Agent.BuildDirectory)/rocm
  - template: /.azuredevops/variables-global.yml
-  pool: 
+  pool:
    vmImage: ${{ variables.BASE_BUILD_POOL }}
  workspace:
    clean: all
--- a/.azuredevops/components/hipBLAS.yml
+++ b/.azuredevops/components/hipBLAS.yml
@@ -59,8 +59,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -98,7 +96,7 @@ jobs:

 - job: hipBLAS_testing
  dependsOn: hipBLAS
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -132,6 +130,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: hipBLAS
--- a/.azuredevops/components/hipBLASLt.yml
+++ b/.azuredevops/components/hipBLASLt.yml
@@ -22,6 +22,7 @@ parameters:
  type: object
  default:
    - joblib
+    - packaging
 - name: rocmDependencies
  type: object
  default:
@@ -74,8 +75,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -150,7 +149,7 @@ jobs:

 - job: hipBLASLt_testing
  dependsOn: hipBLASLt
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -166,6 +165,7 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
+      pipModules: ${{ parameters.pipModules }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
    parameters:
@@ -184,6 +184,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: hipBLASLt
--- a/.azuredevops/components/hipCUB.yml
+++ b/.azuredevops/components/hipCUB.yml
@@ -43,8 +43,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -78,7 +76,7 @@ jobs:

 - job: hipCUB_testing
  dependsOn: hipCUB
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -112,6 +110,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: hipCUB
--- a/.azuredevops/components/hipFFT.yml
+++ b/.azuredevops/components/hipFFT.yml
@@ -55,8 +55,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -96,7 +94,7 @@ jobs:

 - job: hipFFT_testing
  dependsOn: hipFFT
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -130,6 +128,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: hipFFT
--- a/.azuredevops/components/hipRAND.yml
+++ b/.azuredevops/components/hipRAND.yml
@@ -45,8 +45,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -82,7 +80,7 @@ jobs:

 - job: hipRAND_testing
  dependsOn: hipRAND
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -116,6 +114,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: hipRAND
--- a/.azuredevops/components/hipSOLVER.yml
+++ b/.azuredevops/components/hipSOLVER.yml
@@ -57,8 +57,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -103,7 +101,7 @@ jobs:

 - job: hipSOLVER_testing
  dependsOn: hipSOLVER
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -137,6 +135,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: hipSOLVER
--- a/.azuredevops/components/hipSPARSE.yml
+++ b/.azuredevops/components/hipSPARSE.yml
@@ -52,8 +52,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -97,7 +95,7 @@ jobs:

 - job: hipSPARSE_testing
  dependsOn: hipSPARSE
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -131,6 +129,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: hipSPARSE
--- a/.azuredevops/components/hipSPARSELt.yml
+++ b/.azuredevops/components/hipSPARSELt.yml
@@ -123,7 +123,7 @@ jobs:

 - job: hipSPARSELt_testing
  dependsOn: hipSPARSELt
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -151,6 +151,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: hipSPARSELt
--- a/.azuredevops/components/hipTensor.yml
+++ b/.azuredevops/components/hipTensor.yml
@@ -42,8 +42,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -79,7 +77,7 @@ jobs:
 - job: hipTensor_testing
  timeoutInMinutes: 90
  dependsOn: hipTensor
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -113,6 +111,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: hipTensor
--- a/.azuredevops/components/hipfort.yml
+++ b/.azuredevops/components/hipfort.yml
@@ -51,8 +51,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -94,7 +92,7 @@ jobs:

 - job: hipfort_testing
  dependsOn: hipfort
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -139,6 +137,7 @@ jobs:
        sudo rm -rf /opt/rocm
        sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
      workingDirectory: $(Build.SourcesDirectory)
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - task: Bash@3
    displayName: 'Test hipfort'
    inputs:
--- a/.azuredevops/components/omniperf.yml
+++ b/.azuredevops/components/omniperf.yml
@@ -58,8 +58,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -92,7 +90,7 @@ jobs:

 - job: omniperf_testing
  dependsOn: omniperf
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -150,6 +148,7 @@ jobs:
        -DCMAKE_BUILD_TYPE=Release
        -DENABLE_TESTS=ON
        -DINSTALL_TESTS=ON
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: omniperf
--- a/.azuredevops/components/omnitrace.yml
+++ b/.azuredevops/components/omnitrace.yml
@@ -40,6 +40,7 @@ parameters:
 - name: rocmDependencies
  type: object
  default:
+    - aomp
    - clr
    - llvm-project
    - rccl
@@ -63,8 +64,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
--- a/.azuredevops/components/rccl.yml
+++ b/.azuredevops/components/rccl.yml
@@ -60,8 +60,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -102,7 +100,7 @@ jobs:
 - job: rccl_testing
  timeoutInMinutes: 120
  dependsOn: rccl
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -136,6 +134,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rccl
--- a/.azuredevops/components/rdc.yml
+++ b/.azuredevops/components/rdc.yml
@@ -60,8 +60,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -113,7 +111,7 @@ jobs:

 - job: rdc_testing
  dependsOn: rdc
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -131,6 +129,8 @@ jobs:
      aptPackages: ${{ parameters.aptPackages }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+    parameters:
+      gpuTarget: $(JOB_GPU_TARGET)
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    parameters:
      ${{ if eq(parameters.checkoutRef, '') }}:
@@ -156,6 +156,7 @@ jobs:
        sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rdcd /usr/sbin/rdcd
        echo $(Agent.BuildDirectory)/rocm/lib/rdc/grpc/lib | sudo tee /etc/ld.so.conf.d/grpc.conf
        sudo ldconfig -v
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - task: Bash@3
    displayName: Test rdc
    inputs:
--- a/.azuredevops/components/rocAL.yml
+++ b/.azuredevops/components/rocAL.yml
@@ -53,6 +53,7 @@ parameters:
    - half
    - llvm-project
    - MIVisionX
+    - rocminfo
    - rocprofiler-register
    - ROCR-Runtime
    - rpp
@@ -70,8 +71,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - task: Bash@3
    displayName: 'Register libjpeg-turbo packages'
@@ -157,7 +156,7 @@ jobs:

 - job: rocAL_testing
  dependsOn: rocAL
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -220,6 +219,7 @@ jobs:
        mkdir rocAL-tests
        cd rocAL-tests
        cmake $(Agent.BuildDirectory)/rocm/share/rocal/test
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocAL
--- a/.azuredevops/components/rocALUTION.yml
+++ b/.azuredevops/components/rocALUTION.yml
@@ -60,8 +60,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -98,7 +96,7 @@ jobs:

 - job: rocALUTION_testing
  dependsOn: rocALUTION
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -132,6 +130,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocALUTION
--- a/.azuredevops/components/rocBLAS.yml
+++ b/.azuredevops/components/rocBLAS.yml
@@ -74,8 +74,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -121,7 +119,7 @@ jobs:

 - job: rocBLAS_testing
  dependsOn: rocBLAS
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -156,6 +154,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocBLAS
--- a/.azuredevops/components/rocDecode.yml
+++ b/.azuredevops/components/rocDecode.yml
@@ -17,7 +17,7 @@ parameters:
    - libavformat-dev
    - libavutil-dev
    - libstdc++-12-dev
-    - libva-dev
+    - libva-amdgpu-dev
    - mesa-amdgpu-va-drivers
    - libdrm-dev
 - name: rocmDependencies
@@ -35,6 +35,7 @@ parameters:
  default:
    - clr
    - llvm-project
+    - rocminfo
    - rocprofiler-register
    - ROCR-Runtime

@@ -88,7 +89,7 @@ jobs:

 - job: rocDecode_testing
  dependsOn: rocDecode
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -128,6 +129,7 @@ jobs:
      mkdir rocDecode-tests
      cd rocDecode-tests
      cmake /opt/rocm/share/rocdecode/test
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocDecode
--- a/.azuredevops/components/rocFFT.yml
+++ b/.azuredevops/components/rocFFT.yml
@@ -55,8 +55,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -95,7 +93,7 @@ jobs:

 - job: rocFFT_testing
  dependsOn: rocFFT
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -129,6 +127,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocFFT
--- a/.azuredevops/components/rocMLIR.yml
+++ b/.azuredevops/components/rocMLIR.yml
@@ -13,6 +13,7 @@ parameters:
    - git
    - python3-pip
    - libdrm-dev
+    - libstdc++-12-dev
 - name: pipModules
  type: object
  default:
@@ -28,7 +29,7 @@ parameters:
    - ROCR-Runtime

 jobs:
- job: rocMLIR_library
+- job: rocMLIR
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -66,16 +67,19 @@ jobs:

 # compiling and running test on the test system together
 - job: rocMLIR_testing
+  dependsOn: rocMLIR
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
-  pool: ${{ variables.GFX942_TEST_POOL }}
+  pool: $(JOB_TEST_POOL)
  workspace:
    clean: all
  strategy:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
+        JOB_TEST_POOL: ${{ variables.GFX942_TEST_POOL }}
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -119,6 +123,7 @@ jobs:
        -DAMDGPU_TARGETS=$(JOB_GPU_TARGET)
        -DROCM_TEST_CHIPSET=$(JOB_GPU_TARGET)
        -GNinja
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocMLIR
--- a/.azuredevops/components/rocPRIM.yml
+++ b/.azuredevops/components/rocPRIM.yml
@@ -42,8 +42,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -77,7 +75,7 @@ jobs:

 - job: rocPRIM_testing
  dependsOn: rocPRIM
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -111,6 +109,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocPRIM
--- a/.azuredevops/components/rocPyDecode.yml
+++ b/.azuredevops/components/rocPyDecode.yml
@@ -8,25 +8,21 @@ parameters:
 - name: aptPackages
  type: object
  default:
-    - python3-pip
    - cmake
-    - ninja-build
-    - pkg-config
    - ffmpeg
    - libavcodec-dev
    - libavformat-dev
    - libavutil-dev
-    - libva-dev
-    - libdrm-dev
-    - pybind11-dev
-    - python3-pybind11
    - libdlpack-dev
+    - libdrm-dev
+    - libva-dev
+    - ninja-build
+    - pkg-config
+    - python3-pip
 - name: pipModules
  type: object
  default:
-    - -i
-    - https://test.pypi.org/simple
-    - hip-python
+    - pybind11
 - name: rocmDependencies
  type: object
  default:
@@ -52,13 +48,16 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
      pipModules: ${{ parameters.pipModules }}
+  - task: Bash@3
+    displayName: 'pip install hip-python'
+    inputs:
+      targetType: inline
+      script: pip install -i https://test.pypi.org/simple hip-python
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
@@ -80,11 +79,16 @@ jobs:
      script: |
        sudo rm -rf /opt/rocm
        sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
+  - task: Bash@3
+    displayName: 'Set User Site Packages Path'
+    inputs:
+      targetType: inline
+      script: echo "##vso[task.setvariable variable=USER_SITE;]$(python3 -m site --user-site)"  # python3, as in the testing job; a bare "python" may not exist on the agent
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
    parameters:
      extraBuildFlags: >-
        -DROCM_PATH=$(Agent.BuildDirectory)/rocm
-        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(USER_SITE)/pybind11
        -DCMAKE_BUILD_TYPE=Release
        -DAMDGPU_TARGETS=$(JOB_GPU_TARGET)
        -DCMAKE_INSTALL_PREFIX_PYTHON=$(Build.BinariesDirectory)
@@ -116,7 +120,7 @@ jobs:

 - job: rocPyDecode_testing
  dependsOn: rocPyDecode
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -132,8 +136,16 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
+      pipModules: ${{ parameters.pipModules }}
+  - task: Bash@3
+    displayName: 'pip install hip-python'
+    inputs:
+      targetType: inline
+      script: pip install -i https://test.pypi.org/simple hip-python
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+    parameters:
+      gpuTarget: $(JOB_GPU_TARGET)
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    parameters:
      ${{ if eq(parameters.checkoutRef, '') }}:
@@ -158,9 +170,11 @@ jobs:
      script: |
        sudo rm -rf /opt/rocm
        sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
+        USER_SITE="$(python3 -m site --user-site)"; echo "##vso[task.setvariable variable=USER_SITE;]${USER_SITE}"  # shell variable for this step, pipeline variable for later steps
        cd $(Build.SourcesDirectory)
        sudo pip install .
-        cmake -DAMDGPU_TARGETS=$(JOB_GPU_TARGET) .
+        cmake "-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;${USER_SITE}/pybind11" -DAMDGPU_TARGETS=$(JOB_GPU_TARGET) .  # quoted: a bare ';' would end the cmake command
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocPyDecode
@@ -168,7 +182,9 @@ jobs:
 # sudo required for pip install but screws up permissions for next pipeline run
  - task: Bash@3
    displayName: Clean up test environment
+    condition: always()
    inputs:
      targetType: inline
      script: |
+        sudo pip uninstall -y rocPyDecode
        sudo rm -rf $(Build.SourcesDirectory)/*
--- a/.azuredevops/components/rocRAND.yml
+++ b/.azuredevops/components/rocRAND.yml
@@ -45,8 +45,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -79,7 +77,7 @@ jobs:

 - job: rocRAND_testing
  dependsOn: rocRAND
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -113,6 +111,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocRAND
--- a/.azuredevops/components/rocSOLVER.yml
+++ b/.azuredevops/components/rocSOLVER.yml
@@ -58,8 +58,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -113,7 +111,7 @@ jobs:

 - job: rocSOLVER_testing
  dependsOn: rocSOLVER
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -147,6 +145,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocSOLVER
--- a/.azuredevops/components/rocSPARSE.yml
+++ b/.azuredevops/components/rocSPARSE.yml
@@ -56,8 +56,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -107,7 +105,7 @@ jobs:
 - job: rocSPARSE_testing
  timeoutInMinutes: 90
  dependsOn: rocSPARSE
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -141,6 +139,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocSPARSE
--- a/.azuredevops/components/rocThrust.yml
+++ b/.azuredevops/components/rocThrust.yml
@@ -47,8 +47,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -82,7 +80,7 @@ jobs:

 - job: rocThrust_testing
  dependsOn: rocThrust
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -116,6 +114,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocThrust
--- a/.azuredevops/components/rocWMMA.yml
+++ b/.azuredevops/components/rocWMMA.yml
@@ -56,8 +56,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -94,7 +92,7 @@ jobs:
 - job: rocWMMA_testing
  timeoutInMinutes: 90
  dependsOn: rocWMMA
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -128,6 +126,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocWMMA
--- a/.azuredevops/components/rocm-examples.yml
+++ b/.azuredevops/components/rocm-examples.yml
@@ -60,6 +60,7 @@ parameters:
    - rocSOLVER
    - rocSPARSE
    - rocThrust
+    - roctracer

 jobs:
 - job: rocm_examples
@@ -73,8 +74,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -116,7 +115,7 @@ jobs:

 - job: rocm_examples_testing
  dependsOn: rocm_examples
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -135,9 +134,9 @@ jobs:
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
    parameters:
-      gpuTarget: $(JOB_GPU_TARGET)
+      checkoutRepo: ${{ parameters.checkoutRepo }}
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    parameters:
      ${{ if eq(parameters.checkoutRef, '') }}:
@@ -152,20 +151,19 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
-  - task: Bash@3
-    displayName: Unload and reload AMDGPU
-    inputs:
-      targetType: inline
-      script: |
-        sudo modprobe -r amdgpu
-        sudo modprobe amdgpu
-  - task: Bash@3
-    displayName: Iterate through examples
-    inputs:
-      targetType: inline
-      script: |
-        for file in *; do
-          echo Now running: $file
-          ./$file | tee -a $(TEST_LOG_FILE)
-        done
-      workingDirectory: $(Agent.BuildDirectory)/rocm/examples
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+    parameters:
+      # https://github.com/ROCm/HIP/issues/2203
+      extraBuildFlags: >-
+        -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+        -DROCM_ROOT=$(Agent.BuildDirectory)/rocm
+        -DAMDGPU_TARGETS=$(JOB_GPU_TARGET)
+        -DCMAKE_HIP_ARCHITECTURES=$(JOB_GPU_TARGET)
+        -DCMAKE_EXE_LINKER_FLAGS=-fgpu-rdc
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+    parameters:
+      componentName: rocm-examples
+      testDir: $(Build.SourcesDirectory)/build
+      reloadAMDGPU: true
--- a/.azuredevops/components/rocm_bandwidth_test.yml
+++ b/.azuredevops/components/rocm_bandwidth_test.yml
@@ -27,6 +27,7 @@ parameters:
 - name: rocmTestDependencies
  type: object
  default:
+    - rocminfo
    - rocprofiler-register
    - ROCR-Runtime

@@ -72,7 +73,7 @@ jobs:

 - job: rocm_bandwidth_test_testing
  dependsOn: rocm_bandwidth_test
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -99,6 +100,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocm_bandwidth_test
--- a/.azuredevops/components/rocm_smi_lib.yml
+++ b/.azuredevops/components/rocm_smi_lib.yml
@@ -29,7 +29,7 @@ jobs:

 - job: rocm_smi_lib_testing
  dependsOn: rocm_smi_lib
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -44,6 +44,9 @@ jobs:
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+    parameters:
+      runRocminfo: false
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocm_smi_lib
--- a/.azuredevops/components/rocminfo.yml
+++ b/.azuredevops/components/rocminfo.yml
@@ -49,7 +49,7 @@ jobs:

 - job: rocminfo_testing
  dependsOn: rocminfo
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -72,6 +72,9 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+    parameters:
+      runRocminfo: false
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocminfo
--- a/.azuredevops/components/rocprofiler-sdk.yml
+++ b/.azuredevops/components/rocprofiler-sdk.yml
@@ -55,8 +55,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -0,0 +1,151 @@
+# largely referenced from: https://github.com/ROCm/omnitrace/blob/main/.github/workflows/ubuntu-jammy.yml
+# Builds rocprofiler-systems for gfx942, runs the shared test step, then runs the artifact-upload step.
+parameters:
+# Repository handed to the checkout step.
+- name: checkoutRepo
+  type: string
+  default: 'self'
+# Empty: ROCm dependencies come from "staging"; non-empty: from "tag-builds".
+- name: checkoutRef
+  type: string
+  default: ''
+# apt packages forwarded to dependencies-other.yml.
+- name: aptPackages
+  type: object
+  default:
+    - autoconf
+    - autotools-dev
+    - bison
+    - build-essential
+    - bzip2
+    - clang
+    - cmake
+    - environment-modules
+    - g++-12
+    - libdrm-dev
+    - libfabric-dev
+    - libiberty-dev
+    - libpapi-dev
+    - libpfm4-dev
+    - libtool
+    - libopenmpi-dev
+    - m4
+    - openmpi-bin
+    - software-properties-common
+    - python3-pip
+    - texinfo
+    - zlib1g-dev
+# pip modules forwarded to dependencies-other.yml.
+- name: pipModules
+  type: object
+  default:
+    - numpy
+    - perfetto
+    - dataclasses
+# ROCm components forwarded to dependencies-rocm.yml as dependencyList.
+- name: rocmDependencies
+  type: object
+  default:
+    - clr
+    - llvm-project
+    - rccl
+    - rocm-core
+    - rocm_smi_lib
+    - rocminfo
+    - ROCR-Runtime
+    - rocprofiler
+    - rocprofiler-register
+    - roctracer
+
+jobs:
+- job: rocprofiler_systems
+  variables:
+  - group: common
+  - template: /.azuredevops/variables-global.yml
+  pool: ${{ variables.MEDIUM_BUILD_POOL }}
+  workspace:
+    clean: all
+  strategy:
+    matrix:
+      gfx942:
+        JOB_GPU_TARGET: gfx942
+  steps:
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+    parameters:
+      aptPackages: ${{ parameters.aptPackages }}
+      pipModules: ${{ parameters.pipModules }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+    parameters:
+      checkoutRepo: ${{ parameters.checkoutRepo }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+    parameters:
+      dependencyList: ${{ parameters.rocmDependencies }}
+      gpuTarget: $(JOB_GPU_TARGET)
+      # CI case: download latest default branch build
+      ${{ if eq(parameters.checkoutRef, '') }}:
+        dependencySource: staging
+      # manual build case: triggered by ROCm/ROCm repo
+      ${{ elseif ne(parameters.checkoutRef, '') }}:
+        dependencySource: tag-builds
+  - task: Bash@3
+    displayName: ROCm symbolic link
+    inputs:
+      targetType: inline
+      script: |
+        sudo rm -rf /opt/rocm
+        sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
+  - task: Bash@3
+    displayName: Add ROCm binaries to PATH
+    inputs:
+      targetType: inline
+      script: echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
+  - task: Bash@3
+    displayName: Add ROCm compilers to PATH
+    inputs:
+      targetType: inline
+      script: echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+    parameters:
+      # build flags reference: https://rocm.docs.amd.com/projects/omnitrace/en/latest/install/install.html
+      extraBuildFlags: >-
+        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+        -DROCPROFSYS_BUILD_TESTING=ON
+        -DROCPROFSYS_BUILD_DYNINST=ON
+        -DROCPROFSYS_BUILD_LIBUNWIND=ON
+        -DDYNINST_BUILD_TBB=ON
+        -DDYNINST_BUILD_ELFUTILS=ON
+        -DDYNINST_BUILD_LIBIBERTY=ON
+        -DDYNINST_BUILD_BOOST=ON
+        -DROCPROFSYS_USE_PAPI=ON
+        -DROCPROFSYS_USE_MPI=ON
+        -DAMDGPU_TARGETS=$(JOB_GPU_TARGET)
+      multithreadFlag: -- -j32
+  # NOTE(review): each Bash@3 task runs in its own shell, so variables exported by this
+  # "source" are presumably gone before test.yml runs; confirm whether this step has any effect.
+  - task: Bash@3
+    displayName: Set up rocprofiler-systems env
+    inputs:
+      targetType: inline
+      script: source share/rocprofiler-systems/setup-env.sh
+      workingDirectory: build
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+    parameters:
+      componentName: rocprofiler-systems
+  # Drop the ROCm bin directory from PATH again. PATH is wrapped in ':' so the entry
+  # matches at any position (prependpath puts it at the front), then the helper colons
+  # are stripped, leaving the remaining entries untouched.
+  - task: Bash@3
+    displayName: Remove ROCm binaries from PATH
+    inputs:
+      targetType: inline
+      script: echo "##vso[task.setvariable variable=PATH]$(echo ":$PATH:" | sed -e 's;:$(Agent.BuildDirectory)/rocm/bin:;:;g' -e 's;^:;;' -e 's;:$;;')"
+  # Same removal for the ROCm LLVM bin directory.
+  - task: Bash@3
+    displayName: Remove ROCm compilers from PATH
+    inputs:
+      targetType: inline
+      script: echo "##vso[task.setvariable variable=PATH]$(echo ":$PATH:" | sed -e 's;:$(Agent.BuildDirectory)/rocm/llvm/bin:;:;g' -e 's;^:;;' -e 's;:$;;')"
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+    parameters:
+      gpuTarget: $(JOB_GPU_TARGET)
--- a/.azuredevops/components/rocprofiler.yml
+++ b/.azuredevops/components/rocprofiler.yml
@@ -57,8 +57,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -100,7 +98,7 @@ jobs:

 - job: rocprofiler_testing
  dependsOn: rocprofiler
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -141,6 +139,7 @@ jobs:
      script: |
        sudo rm -rf /opt/rocm
        sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocprofilerV1
--- a/.azuredevops/components/rocr_debug_agent.yml
+++ b/.azuredevops/components/rocr_debug_agent.yml
@@ -72,7 +72,7 @@ jobs:

 - job: rocr_debug_agent_testing
  dependsOn: rocr_debug_agent
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -114,6 +114,7 @@ jobs:
      cmakeBuildDir: '$(Agent.BuildDirectory)/rocm/src/rocm-debug-agent-test'
      cmakeSourceDir: '.'
      installEnabled: false
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rocr_debug_agent
--- a/.azuredevops/components/roctracer.yml
+++ b/.azuredevops/components/roctracer.yml
@@ -30,6 +30,7 @@ parameters:
  default:
    - clr
    - llvm-project
+    - rocminfo
    - rocprofiler-register
    - ROCR-Runtime

@@ -48,8 +49,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -77,6 +76,7 @@ jobs:
        -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
        -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
        -DGPU_TARGETS=$(JOB_GPU_TARGET)
+        -DAMDGPU_TARGETS=$(JOB_GPU_TARGET)
        -GNinja
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
    parameters:
@@ -84,7 +84,7 @@ jobs:

 - job: roctracer_testing
  dependsOn: roctracer
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -118,6 +118,7 @@ jobs:
        dependencySource: staging
      ${{ elseif ne(parameters.checkoutRef, '') }}:
        dependencySource: tag-builds
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: roctracer
--- a/.azuredevops/components/rpp.yml
+++ b/.azuredevops/components/rpp.yml
@@ -56,8 +56,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
    parameters:
@@ -93,7 +91,7 @@ jobs:

 - job: rpp_testing
  dependsOn: rpp
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -173,6 +171,7 @@ jobs:
        cmake /opt/rocm/share/rpp/test \
          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++ \
          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
    parameters:
      componentName: rpp
--- a/.azuredevops/dependencies/boost.yml
+++ b/.azuredevops/dependencies/boost.yml
@@ -0,0 +1,65 @@
+parameters:
+- name: checkoutRepo
+  type: string
+  default: 'self'
+- name: checkoutRef
+  type: string
+  default: ''
+- name: boostVersion  # branch or tag handed to `git clone -b` in the clone step
+  type: string
+  default: 'master'  # an empty default would render a malformed `git clone -b  <url>`; same default as tag-builds/boost.yml
+- name: aptPackages
+  type: object
+  default:
+    - git
+- name: rocmDependencies
+  type: object
+  default:
+    - llvm-project
+
+jobs:
+- job: boost
+  variables:
+  - group: common
+  - template: /.azuredevops/variables-global.yml
+  pool:
+    vmImage: ${{ variables.BASE_BUILD_POOL }}
+  workspace:
+    clean: all
+  steps:
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+    parameters:
+      aptPackages: ${{ parameters.aptPackages }}
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+  - task: Bash@3
+    displayName: 'git clone boost'
+    inputs:
+      targetType: inline
+      script: git clone -b ${{ parameters.boostVersion }} https://github.com/boostorg/boost --depth=1 --recurse-submodules
+      workingDirectory: $(Build.SourcesDirectory)
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+    parameters:
+      dependencyList: ${{ parameters.rocmDependencies }}
+      dependencySource: staging
+  - task: Bash@3
+    displayName: Add ROCm binaries to PATH
+    inputs:
+      targetType: inline
+      script: echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
+  - task: Bash@3
+    displayName: Add ROCm compilers to PATH
+    inputs:
+      targetType: inline
+      script: echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
+  - task: Bash@3
+    displayName: 'Build Boost with clang'
+    inputs:
+      targetType: inline
+      script: |
+        export CC=$(Agent.BuildDirectory)/rocm/llvm/bin/clang
+        export CXX=$(Agent.BuildDirectory)/rocm/llvm/bin/clang++
+        ./bootstrap.sh --with-toolset=clang --prefix=$(Build.BinariesDirectory)
+        ./b2 --toolset=clang threading=multi link=shared --prefix=$(Build.BinariesDirectory) cxxflags="-std=c++20"
+        ./b2 install
+      workingDirectory: $(Build.SourcesDirectory)/boost
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
--- a/.azuredevops/dependencies/grpc.yml
+++ b/.azuredevops/dependencies/grpc.yml
@@ -20,7 +20,7 @@ jobs:
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
-  pool: 
+  pool:
    vmImage: ${{ variables.BASE_BUILD_POOL }}
  workspace:
    clean: all
--- a/.azuredevops/dependencies/gtest.yml
+++ b/.azuredevops/dependencies/gtest.yml
@@ -20,7 +20,7 @@ jobs:
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
-  pool: 
+  pool:
    vmImage: ${{ variables.BASE_BUILD_POOL }}
  workspace:
    clean: all
--- a/.azuredevops/dependencies/lapack.yml
+++ b/.azuredevops/dependencies/lapack.yml
@@ -21,7 +21,7 @@ jobs:
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
-  pool: 
+  pool:
    vmImage: ${{ variables.BASE_BUILD_POOL }}
  workspace:
    clean: all
--- a/.azuredevops/nightly/pytorch.yml
+++ b/.azuredevops/nightly/pytorch.yml
@@ -11,7 +11,6 @@ parameters:
    - ca-certificates
    - bc
    - bridge-utils
-    - cmake
    - devscripts
    - dkms
    - doxygen
@@ -67,8 +66,9 @@ parameters:
 - name: pipModules
  type: object
  default:
+    - cmake
    - astunparse
-    - expecttest!=0.2.0
+    - expecttest>=0.2.1
    - hypothesis
    - numpy
    - psutil
@@ -85,14 +85,15 @@ parameters:
    - lintrunner
    - ninja
    - packaging
-    - optree>=0.12.0
+    - optree>=0.13.0
+    - click>=8.0.3
  # list for vision
    - auditwheel
    - future
    - pytest
    - pytest-azurepipelines
    - pillow
-# list from https://github.com/pytorch/builder/blob/main/manywheel/build_rocm.sh
+# list from https://github.com/pytorch/pytorch/blob/main/.ci/manywheel/build_rocm.sh
 - name: rocmDependencies
  type: object
  default:
@@ -122,6 +123,7 @@ parameters:
    - hipCUB
    - rocThrust
    - hipBLAS-common
+    - composable_kernel
 - name: rocmTestDependencies
  type: object
  default:
@@ -159,13 +161,10 @@ jobs:
      amd-staging-gfx942:
        ROCM_BRANCH: amd-staging
        JOB_GPU_TARGET: gfx942
-      amd-staging-gfx90a:
-        ROCM_BRANCH: amd-staging
-        JOB_GPU_TARGET: gfx90a
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
-# various flags/parameters expected by bash scripts in pytorch builder repo
+# various flags/parameters expected by bash scripts in pytorch repo's .ci directory
  - name: ROCM_VERSION
    value: 6.3.0
  - name: ROCM_PATH
@@ -186,7 +185,7 @@ jobs:
  workspace:
    clean: all
  steps:
-# copy environment setup from https://github.com/pytorch/builder/blob/main/manywheel/Dockerfile
+# copy environment setup from https://github.com/pytorch/pytorch/blob/main/.ci/docker/manywheel/Dockerfile
 # but instead of centos, use ubuntu environment
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
  - task: Bash@3
@@ -223,18 +222,21 @@ jobs:
      targetType: inline
      script: sudo ln -s $(Agent.BuildDirectory)/rocm /opt/rocm
  - checkout: self
-  - task: Bash@3
-    displayName: git clone pytorch builder
-    inputs:
-      targetType: inline
-      script: git clone https://github.com/pytorch/builder.git --depth=1 --recurse-submodules
-      workingDirectory: $(Build.SourcesDirectory)
  - task: Bash@3
    displayName: git clone upstream pytorch
    inputs:
      targetType: inline
      script: git clone https://github.com/pytorch/pytorch.git --depth=1 --recurse-submodules
      workingDirectory: $(Build.SourcesDirectory)
+# builder clone still needed due to run_tests.sh at end of build_common.sh call
+  - task: Bash@3
+    displayName: git clone pytorch builder
+    inputs:
+      targetType: inline  # -sfn below replaces a /builder link left by an earlier run instead of failing or nesting a link inside it
+      script: |
+        git clone https://github.com/pytorch/builder.git --depth=1 --recurse-submodules
+        sudo ln -sfn $(Build.SourcesDirectory)/builder /builder
+      workingDirectory: $(Build.SourcesDirectory)
  - task: Bash@3
    displayName: Install patchelf
    inputs:
@@ -287,8 +289,8 @@ jobs:
        PYTORCH_BUILD_VERSION=$(cat $(Build.SourcesDirectory)/pytorch/version.txt | cut -da -f1)
        PYTORCH_BUILD_NUMBER=$(date -u +%Y%m%d)
        SKIP_ALL_TESTS=1
-        bash ./manywheel/build_rocm.sh
-      workingDirectory: $(Build.SourcesDirectory)/builder
+        bash ./.ci/manywheel/build_rocm.sh
+      workingDirectory: $(Build.SourcesDirectory)/pytorch
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-prepare-package.yml
    parameters:
      sourceDir: /remote/wheelhouserocm$(ROCM_VERSION)
@@ -348,7 +350,7 @@ jobs:

 - job: torchvision_testing
  dependsOn: pytorch
-  condition: succeeded()
+  condition: and(succeeded(), eq(variables.ENABLE_GFX942_TESTS, 'true'), not(containsValue(split(variables.DISABLED_GFX942_TESTS, ','), variables['Build.DefinitionName'])))
  variables:
  - group: common
  - template: /.azuredevops/variables-global.yml
@@ -377,7 +379,6 @@ jobs:
    parameters:
      aptPackages: ${{ parameters.aptPackages }}
      pipModules: ${{ parameters.pipModules }}
-  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - task: DownloadPipelineArtifact@2
    displayName: 'Download Pipeline Wheel Files'
--- a/.azuredevops/nightly/rocm-nightly.yml
+++ b/.azuredevops/nightly/rocm-nightly.yml
@@ -47,6 +47,7 @@ parameters:
    - rocPRIM
    - rocprofiler-register
    - rocprofiler-sdk
+    - rocprofiler-systems
    - rocprofiler
    - rocPyDecode
    - ROCR-Runtime
@@ -81,8 +82,6 @@ jobs:
    matrix:
      gfx942:
        JOB_GPU_TARGET: gfx942
-      gfx90a:
-        JOB_GPU_TARGET: gfx90a
  steps:
  - task: DeleteFiles@1
    displayName: 'Cleanup checkout space'
@@ -119,7 +118,7 @@ jobs:
  - script: du -sh $(Build.ArtifactStagingDirectory)
    displayName: Compressed ROCm size
  - task: PublishPipelineArtifact@1
-    displayName: 'Public ROCm Nightly Artifact'
+    displayName: 'Publish ROCm Nightly Artifact'
    retryCountOnTaskFailure: 3
    inputs:
      targetPath: '$(Build.ArtifactStagingDirectory)'
--- a/.azuredevops/tag-builds/Tensile.yml
+++ b/.azuredevops/tag-builds/Tensile.yml
@@ -0,0 +1,29 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: checkoutRef
+  type: string
+  default: refs/tags/$(LATEST_RELEASE_TAG)
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+  - repository: release_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/Tensile
+    ref: ${{ parameters.checkoutRef }}
+
+trigger: none
+pr: none
+
+jobs:
+  - template: ${{ variables.CI_COMPONENT_PATH }}/Tensile.yml
+    parameters:
+      checkoutRepo: release_repo
+      checkoutRef: ${{ parameters.checkoutRef }}
--- a/.azuredevops/tag-builds/boost.yml
+++ b/.azuredevops/tag-builds/boost.yml
@@ -0,0 +1,23 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: boostVersion
+  type: string
+  default: 'master'
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none
+pr: none
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/boost.yml
+    parameters:
+      boostVersion: ${{ parameters.boostVersion }}
--- a/.azuredevops/tag-builds/rocprofiler-systems.yml
+++ b/.azuredevops/tag-builds/rocprofiler-systems.yml
@@ -0,0 +1,29 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: checkoutRef
+  type: string
+  default: refs/tags/$(LATEST_RELEASE_TAG)
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+  - repository: release_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/rocprofiler-systems
+    ref: ${{ parameters.checkoutRef }}
+
+trigger: none
+pr: none
+
+jobs:
+  - template: ${{ variables.CI_COMPONENT_PATH }}/rocprofiler-systems.yml
+    parameters:
+      checkoutRepo: release_repo
+      checkoutRef: ${{ parameters.checkoutRef }}
--- a/.azuredevops/templates/steps/artifact-download.yml
+++ b/.azuredevops/templates/steps/artifact-download.yml
@@ -9,6 +9,10 @@ parameters:
 - name: useDefaultBranch
  type: boolean
  default: true
+# useMainlineBranch is only processed when useDefaultBranch is false
+- name: useMainlineBranch
+  type: boolean
+  default: false
 - name: latestFromBranch
  type: boolean
  default: true
@@ -22,7 +26,7 @@ parameters:
  type: object
  default:
    AMDMIGraphX: develop
-    amdsmi: develop
+    amdsmi: amd-staging
    aomp-extras: aomp-dev
    aomp: aomp-dev
    clr: amd-staging
@@ -46,8 +50,9 @@ parameters:
    MIOpen: develop
    MIVisionX: develop
    omniperf: amd-staging
+    omnitrace: amd-staging
    rccl: develop
-    rdc: develop
+    rdc: amd-staging
    rocAL: develop
    rocALUTION: develop
    rocBLAS: develop
@@ -56,17 +61,18 @@ parameters:
    rocFFT: develop
    ROCgdb: amd-staging
    rocm-cmake: develop
-    rocm-core: master
+    rocm-core: amd-staging
    rocm-examples: develop
    rocminfo: amd-staging
    rocMLIR: develop
    ROCmValidationSuite: master
    rocm_bandwidth_test: master
-    rocm_smi_lib: develop
+    rocm_smi_lib: amd-staging
    rocPRIM: develop
+    rocprofiler: amd-staging
    rocprofiler-register: amd-staging
    rocprofiler-sdk: amd-staging
-    rocprofiler: amd-staging
+    rocprofiler-systems: amd-staging
    rocPyDecode: develop
    ROCR-Runtime: amd-staging
    rocRAND: develop
@@ -77,6 +83,67 @@ parameters:
    roctracer: amd-staging
    rocWMMA: develop
    rpp: develop
+- name: mainlineBranchList
+  type: object
+  default:
+    AMDMIGraphX: mainline
+    amdsmi: amd-mainline
+    aomp-extras: amd-mainline-open
+    aomp: amd-mainline-open
+    clr: amd-mainline
+    composable_kernel: mainline
+    half: rocm
+    HIP: amd-mainline
+    hip-tests: amd-mainline
+    hipBLAS: mainline
+    hipBLASLt: mainline
+    hipBLAS-common: mainline
+    hipCUB: mainline
+    hipFFT: mainline
+    hipfort: mainline
+    HIPIFY: amd-mainline
+    hipRAND: mainline
+    hipSOLVER: mainline
+    hipSPARSE: mainline
+    hipSPARSELt: mainline
+    hipTensor: mainline
+    llvm-project: amd-mainline-open
+    MIOpen: mainline
+    MIVisionX: mainline
+    omniperf: amd-mainline
+    omnitrace: amd-mainline
+    rccl: mainline
+    rdc: amd-mainline
+    rocAL: master # needs the yaml file
+    rocALUTION: mainline
+    rocBLAS: mainline
+    ROCdbgapi: amd-mainline
+    rocDecode: mainline
+    rocFFT: mainline
+    ROCgdb: amd-mainline-rocgdb-15
+    rocm-cmake: mainline
+    rocm-core: amd-master
+    rocm-examples: develop # no mainline
+    rocminfo: amd-master
+    rocMLIR: mainline # needs the yaml file
+    ROCmValidationSuite: mainline
+    rocm_bandwidth_test: master
+    rocm_smi_lib: amd-mainline
+    rocPRIM: mainline
+    rocprofiler: amd-master
+    rocprofiler-register: amd-mainline
+    rocprofiler-sdk: amd-mainline
+    rocprofiler-systems: amd-mainline
+    rocPyDecode: mainline
+    ROCR-Runtime: amd-master
+    rocRAND: mainline
+    rocr_debug_agent: amd-mainline
+    rocSOLVER: mainline
+    rocSPARSE: mainline
+    rocThrust: mainline
+    roctracer: amd-master
+    rocWMMA: mainline
+    rpp: mainline
 - name: allowPartiallySucceededBuilds
  type: object
  default:
@@ -110,6 +177,8 @@ steps:
        buildVersionToDownload: latestFromBranch # default is 'latest'
    ${{ if eq(parameters.useDefaultBranch, true) }}:
      branchName: refs/heads/${{ parameters.defaultBranchList[parameters.componentName] }}
+    ${{ elseif eq(parameters.useMainlineBranch, true) }}:
+      branchName: refs/heads/${{ parameters.mainlineBranchList[parameters.componentName] }}
    ${{ else }}:
      branchName: ${{ parameters.branchName }}
    ${{ if containsValue(parameters.allowPartiallySucceededBuilds, parameters.componentName) }}:
--- a/.azuredevops/templates/steps/dependencies-aqlprofile.yml
+++ b/.azuredevops/templates/steps/dependencies-aqlprofile.yml
@@ -8,20 +8,22 @@ parameters:
 - name: repositoryUrl
  type: object
  default: 
-    staging: https://repo.radeon.com/rocm/apt/6.2/pool/main/h/hsa-amd-aqlprofile
-    tag-builds: https://repo.radeon.com/rocm/apt/6.2/pool/main/h/hsa-amd-aqlprofile
- name: packageName
-  type: object
-  default:
-    staging: hsa-amd-aqlprofile_1.0.0.60200.60200-66~22.04_amd64.deb
-    tag-builds: hsa-amd-aqlprofile_1.0.0.60200.60200-66~22.04_amd64.deb
+    staging: https://repo.radeon.com/rocm/apt/latest/pool/main/h/hsa-amd-aqlprofile/ # end slash is important for curl!
+    tag-builds: https://repo.radeon.com/rocm/apt/$(TAGGED_RELEASE)/pool/main/h/hsa-amd-aqlprofile/

 steps:
+- task: Bash@3
+  displayName: Get aqlprofile package name
+  inputs:
+    targetType: inline
+    script: |
+      export packageName=$(curl -s ${{ parameters.repositoryUrl[parameters.dependencySource] }} | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb")
+      echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
 - task: Bash@3
  displayName: 'Download aqlprofile'
  inputs:
    targetType: inline
-    script: wget -nv ${{ parameters.repositoryUrl[parameters.dependencySource] }}/${{ parameters.packageName[parameters.dependencySource] }}
+    script: wget -nv ${{ parameters.repositoryUrl[parameters.dependencySource] }}$(packageName)
    workingDirectory: '$(Pipeline.Workspace)'
 - task: Bash@3
  displayName: 'Extract aqlprofile'
@@ -29,7 +31,7 @@ steps:
    targetType: inline
    script: |
      mkdir hsa-amd-aqlprofile
-      dpkg-deb -R ${{ parameters.packageName[parameters.dependencySource] }} hsa-amd-aqlprofile
+      dpkg-deb -R $(packageName) hsa-amd-aqlprofile
    workingDirectory: '$(Pipeline.Workspace)'
 - task: Bash@3
  displayName: 'Copy aqlprofile files'
@@ -43,5 +45,5 @@ steps:
  displayName: 'Clean up aqlprofile'
  inputs:
    targetType: inline
-    script: rm -rf hsa-amd-aqlprofile ${{ parameters.packageName[parameters.dependencySource] }}
+    script: rm -rf hsa-amd-aqlprofile $(packageName)
    workingDirectory: '$(Pipeline.Workspace)'
--- a/.azuredevops/templates/steps/dependencies-boost.yml
+++ b/.azuredevops/templates/steps/dependencies-boost.yml
@@ -0,0 +1,35 @@
+steps:
+- task: DownloadPipelineArtifact@2  # pull the artifact published by the pipeline whose ID is in BOOST_DEPENDENCY_PIPELINE_ID
+  displayName: Download Boost
+  inputs:
+    buildType: specific
+    project: ROCm-CI
+    definition: $(BOOST_DEPENDENCY_PIPELINE_ID)
+    targetPath: $(Pipeline.Workspace)/d
+- task: ExtractFiles@1  # unpack every downloaded .tar.gz into $(Agent.BuildDirectory)/boost
+  displayName: Extract Boost
+  inputs:
+    archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
+    destinationFolder: $(Agent.BuildDirectory)/boost
+    cleanDestinationFolder: true
+    overwriteExistingFiles: true
+- task: DeleteFiles@1  # remove the archives once they have been extracted
+  displayName: Cleanup Compressed Boost
+  inputs:
+    SourceFolder: $(Pipeline.Workspace)/d
+    Contents: '**/*.tar.gz'
+    RemoveDotFiles: true
+- task: Bash@3  # log the extracted tree for troubleshooting
+  displayName: 'List Boost files'
+  inputs:
+    targetType: inline
+    script: ls -1R $(Agent.BuildDirectory)/boost
+- task: Bash@3  # add boost/lib to the dynamic linker search path via ld.so.conf.d, then refresh and print the cache
+  displayName: 'Link Boost shared libraries'
+  inputs:
+    targetType: inline
+    script: |
+      echo $(Agent.BuildDirectory)/boost/lib | sudo tee /etc/ld.so.conf.d/boost.conf
+      sudo cat /etc/ld.so.conf.d/boost.conf
+      sudo ldconfig -v
+      ldconfig -p
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -9,6 +9,7 @@ parameters:
  default: staging
  values:
    - staging
+    - mainline
    - tag-builds
    - fixed
 - name: extractToMnt
@@ -54,6 +55,7 @@ parameters:
    MIOpen: $(MIOpen_PIPELINE_ID)
    MIVisionX: $(MIVISIONX_PIPELINE_ID)
    omniperf: $(OMNIPERF_PIPELINE_ID)
+    omnitrace: $(OMNITRACE_PIPELINE_ID)
    rccl: $(RCCL_PIPELINE_ID)
    rdc: $(RDC_PIPELINE_ID)
    rocAL: $(ROCAL_PIPELINE_ID)
@@ -74,6 +76,7 @@ parameters:
    rocPRIM: $(ROCPRIM_PIPELINE_ID)
    rocprofiler-register: $(ROCPROFILER_REGISTER_PIPELINE_ID)
    rocprofiler-sdk: $(ROCPROFILER_SDK_PIPELINE_ID)
+    rocprofiler-systems: $(ROCPROFILER_SYSTEMS_PIPELINE_ID)
    rocprofiler: $(ROCPROFILER_PIPELINE_ID)
    rocPyDecode: $(ROCPYDECODE_PIPELINE_ID)
    ROCR-Runtime: $(ROCR_RUNTIME_PIPELINE_ID)
@@ -114,6 +117,7 @@ parameters:
    MIOpen: $(MIOpen_TAGGED_PIPELINE_ID)
    MIVisionX: $(MIVISIONX_TAGGED_PIPELINE_ID)
    omniperf: $(OMNIPERF_TAGGED_PIPELINE_ID)
+    omnitrace: $(OMNITRACE_TAGGED_PIPELINE_ID)
    rccl: $(RCCL_TAGGED_PIPELINE_ID)
    rdc: $(RDC_TAGGED_PIPELINE_ID)
    rocAL: $(ROCAL_TAGGED_PIPELINE_ID)
@@ -134,6 +138,7 @@ parameters:
    rocPRIM: $(ROCPRIM_TAGGED_PIPELINE_ID)
    rocprofiler-register: $(ROCPROFILER_REGISTER_TAGGED_PIPELINE_ID)
    rocprofiler-sdk: $(ROCPROFILER_SDK_TAGGED_PIPELINE_ID)
+    rocprofiler-systems: $(ROCPROFILER_SYSTEMS_TAGGED_PIPELINE_ID)
    rocprofiler: $(ROCPROFILER_TAGGED_PIPELINE_ID)
    rocPyDecode: $(ROCPYDECODE_TAGGED_PIPELINE_ID)
    ROCR-Runtime: $(ROCR_RUNTIME_TAGGED_PIPELINE_ID)
@@ -174,6 +179,7 @@ parameters:
    - hipRAND
    - hipSPARSELt
    - hipTensor
+    - omnitrace
    - rccl
    - rocALUTION
    - rocBLAS
@@ -181,6 +187,7 @@ parameters:
    - rocm-examples
    - rocPRIM
    - rocprofiler-sdk
+    - rocprofiler-systems
    - rocprofiler
    - rocPyDecode
    - rocRAND
@@ -216,6 +223,11 @@ steps:
        ${{ if eq(parameters.dependencySource, 'staging') }}:
          pipelineId: ${{ parameters.stagingPipelineIdentifiers[ split(dependency, ':')[0] ] }}
          latestFromBranch: ${{ parameters.latestFromBranch }}
+        ${{ elseif eq(parameters.dependencySource, 'mainline') }}:
+          pipelineId: ${{ parameters.stagingPipelineIdentifiers[ split(dependency, ':')[0] ] }}
+          useDefaultBranch: false  # defaults to true in artifact-download.yml, which would pick the default branch list and skip useMainlineBranch
+          useMainlineBranch: true
+          latestFromBranch: ${{ parameters.latestFromBranch }}
        ${{ elseif eq(parameters.dependencySource, 'tag-builds') }}:
          pipelineId: ${{ parameters.taggedPipelineIdentifiers[ split(dependency, ':')[0] ] }}
          latestFromBranch: false
@@ -232,6 +244,11 @@ steps:
        ${{ if eq(parameters.dependencySource, 'staging') }}:
          pipelineId: ${{ parameters.stagingPipelineIdentifiers[dependency] }}
          latestFromBranch: ${{ parameters.latestFromBranch }}
+        ${{ elseif eq(parameters.dependencySource, 'mainline') }}:
+          pipelineId: ${{ parameters.stagingPipelineIdentifiers[dependency] }}
+          useDefaultBranch: false  # defaults to true in artifact-download.yml, which would pick the default branch list and skip useMainlineBranch
+          useMainlineBranch: true
+          latestFromBranch: ${{ parameters.latestFromBranch }}
        ${{ elseif eq(parameters.dependencySource, 'tag-builds') }}:
          pipelineId: ${{ parameters.taggedPipelineIdentifiers[dependency] }}
          latestFromBranch: false
--- a/.azuredevops/templates/steps/gpu-diagnostics.yml
+++ b/.azuredevops/templates/steps/gpu-diagnostics.yml
@@ -0,0 +1,54 @@
+# Diagnostics for GPU-enabled systems
+parameters:
+- name: runRocminfo
+  type: boolean
+  default: true
+
+steps:
+- ${{ if eq(parameters.runRocminfo, true) }}:
+  - task: Bash@3
+    displayName: 'rocminfo'
+    continueOnError: true
+    inputs:
+      targetType: inline
+      script: $(Agent.BuildDirectory)/rocm/bin/rocminfo
+  - task: Bash@3
+    displayName: 'rocm_agent_enumerator'
+    continueOnError: true
+    inputs:
+      targetType: inline
+      script: $(Agent.BuildDirectory)/rocm/bin/rocm_agent_enumerator
+- task: Bash@3
+  displayName: 'List DRI devices'
+  continueOnError: true
+  inputs:
+    targetType: inline
+    script: ls -la /dev/dri/by-path/
+- task: Bash@3
+  displayName: 'List amdgpu/rocm/mesa packages'
+  continueOnError: true
+  inputs:
+    targetType: inline
+    script: apt list --installed | grep -E 'amdgpu|rocm|mesa'
+- task: Bash@3  # best-effort diagnostic: show KFD process entries and open files that mention amdgpu
+  displayName: 'List GPU processes'
+  continueOnError: true
+  inputs:
+    targetType: inline  # "|| true" below: grep exits 1 when nothing matches, which would otherwise fail this step
+    script: |
+      ls /sys/class/kfd/kfd/proc/
+      sudo lsof | grep amdgpu || true
+- task: Bash@3
+  displayName: 'System snapshot'
+  continueOnError: true
+  inputs:
+    targetType: inline
+    script: top -bn1
+- task: Bash@3
+  displayName: 'List dmesg'
+  continueOnError: true
+  inputs:
+    targetType: inline
+    script: |
+      echo 'rocm-ci: $(Build.DefinitionName) $(System.DefinitionId)' | sudo tee /dev/kmsg
+      sudo dmesg
--- a/.azuredevops/variables-global.yml
+++ b/.azuredevops/variables-global.yml
@@ -167,6 +167,10 @@ variables:
  value: 241
 - name: OMNIPERF_TAGGED_PIPELINE_ID
  value: 242
+- name: OMNITRACE_PIPELINE_ID
+  value: 253
+- name: OMNITRACE_TAGGED_PIPELINE_ID
+  value: 252
 - name: RCCL_GFX942_TEST_PIPELINE_ID
  value: 184
 - name: RCCL_PIPELINE_ID
@@ -263,6 +267,10 @@ variables:
  value: 246
 - name: ROCPROFILER_SDK_TAGGED_PIPELINE_ID
  value: 234
+- name: ROCPROFILER_SYSTEMS_PIPELINE_ID
+  value: 255
+- name: ROCPROFILER_SYSTEMS_TAGGED_PIPELINE_ID
+  value: 254
 - name: ROCPROFILER_PIPELINE_ID
  value: 143
 - name: ROCPROFILER_TAGGED_PIPELINE_ID
@@ -325,3 +333,5 @@ variables:
  value: 78
 - name: RPP_TAGGED_PIPELINE_ID
  value: 39
+- name: BOOST_DEPENDENCY_PIPELINE_ID
+  value: 250
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -5,3 +5,4 @@ docs/ @amd-aakash @jlgreathouse @samjwu @yhuiYH @ROCm/rocm-documentation
 *.rst @amd-aakash @jlgreathouse @samjwu @yhuiYH @ROCm/rocm-documentation
 # External CI
 /.azuredevops/ @ROCm/external-ci
+tools/rocm-build/ @ROCm/rocm-devops
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -36,6 +36,7 @@ Bluefield
 Bootloader
 CCD
 CDNA
+CHTML
 CIFAR
 CLI
 CLion
@@ -70,6 +71,7 @@ Concretized
 Conda
 ConnectX
 CuPy
+Dashboarding
 DDR
 DF
 DGEMM
@@ -227,6 +229,7 @@ Mellanox's
 Meta's
 Miniconda
 MirroredStrategy
+Mixtral
 Multicore
 Multithreaded
 MyEnvironment
@@ -272,6 +275,7 @@ OpenMPI
 OpenSSL
 OpenVX
 OpenXLA
+Oversubscription
 PCC
 PCI
 PCIe
@@ -293,6 +297,7 @@ PowerShell
 PyPi
 PyTorch
 Qcycles
+Qwen
 RAII
 RAS
 RCCL
@@ -562,6 +567,7 @@ hipfort
 hipify
 hipsolver
 hipsparse
+hlist
 hotspotting
 hpc
 hpp
@@ -585,6 +591,7 @@ intra
 invariants
 invocating
 ipo
+jax
 kdb
 kfd
 latencies
@@ -605,6 +612,7 @@ migraphx
 miopen
 miopengemm
 mivisionx
+mjx
 mkdir
 mlirmiopen
 mtypes
@@ -620,6 +628,7 @@ openmp
 openssl
 optimizers
 os
+oversubscription
 pageable
 parallelization
 parameterization
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -10,10 +10,7 @@ Use this matrix to view the ROCm compatibility and system requirements across su

 You can also refer to the :ref:`past versions of ROCm compatibility matrix<past-rocm-compatibility-matrix>`.

-
-.. note::
-
-   Like AMD Instinct accelerators, AMD Radeon and Radeon Pro GPUs can be used in environments for compute purposes (no display information or graphics). If using AMD Radeon or Radeon Pro GPUs with ROCm for graphics-related purposes (for example, display connected), review the `Compatibility matrices <https://rocm.docs.amd.com/projects/radeon/en/latest/docs/compatibility.html>`_ in the Use ROCm on Radeon GPU documentation to confirm system requirements.
+Accelerators and GPUs listed in the following table support compute workloads (no display information or graphics). If you’re using ROCm with AMD Radeon or Radeon Pro GPUs for graphics workloads, see the `Use ROCm on Radeon GPU documentation <https://rocm.docs.amd.com/projects/radeon/en/latest/docs/compatibility.html>`_ to verify compatibility and system requirements.

 .. |br| raw:: html

--- a/docs/conceptual/ai-migraphx-optimization.md
+++ b/docs/conceptual/ai-migraphx-optimization.md
@@ -1,333 +0,0 @@
-<head>
-  <meta charset="UTF-8">
-  <meta name="description" content="Inference optimization with MIGraphX">
-  <meta name="keywords" content="Inference optimization, MIGraphX, deep-learning, MIGraphX
-  installation, AMD, ROCm">
-</head>
-
-# Inference optimization with MIGraphX
-
-The following sections cover inferencing and introduces [MIGraphX](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/).
-
-## Inference
-
-The inference is where capabilities learned during deep-learning training are put to work. It refers to using a fully trained neural network to make conclusions (predictions) on unseen data that the model has never interacted with before. Deep-learning inferencing is achieved by feeding new data, such as new images, to the network, giving the Deep Neural Network a chance to classify the image.
-
-Taking our previous example of MNIST, the DNN can be fed new images of handwritten digit images, allowing the neural network to classify digits. A fully trained DNN should make accurate predictions about what an image represents, and inference cannot happen without training.
-
-## MIGraphX introduction
-
-MIGraphX is a graph compiler focused on accelerating the machine-learning inference that can target AMD GPUs and CPUs. MIGraphX accelerates the machine-learning models by leveraging several graph-level transformations and optimizations. These optimizations include:
-
-* Operator fusion
-* Arithmetic simplifications
-* Dead-code elimination
-* Common subexpression elimination (CSE)
-* Constant propagation
-
-After doing all these transformations, MIGraphX emits code for the AMD GPU by calling to MIOpen or rocBLAS or creating HIP kernels for a particular operator. MIGraphX can also target CPUs using DNNL or ZenDNN libraries.
-
-MIGraphX provides easy-to-use APIs in C++ and Python to import machine models in ONNX or TensorFlow. Users can compile, save, load, and run these models using the MIGraphX C++ and Python APIs. Internally, MIGraphX parses ONNX or TensorFlow models into internal graph representation where each operator in the model gets mapped to an operator within MIGraphX. Each of these operators defines various attributes such as:
-
-* Number of arguments
-* Type of arguments
-* Shape of arguments
-
-After optimization passes, all these operators get mapped to different kernels on GPUs or CPUs.
-
-After importing a model into MIGraphX, the model is represented as `migraphx::program`. `migraphx::program` is made up of `migraphx::module`. The program can consist of several modules, but it always has one main_module. Modules are made up of `migraphx::instruction_ref`. Instructions contain the `migraphx::op` and arguments to the operator.  
-
-## Installing MIGraphX
-
-There are three options to get started with MIGraphX installation. MIGraphX depends on ROCm libraries; assume that the machine has ROCm installed.
-
-### Option 1: installing binaries
-
-To install MIGraphX on Debian-based systems like Ubuntu, use the following command:
-
-```bash
-sudo apt update && sudo apt install -y migraphx
-```
-
-The header files and libraries are installed under `/opt/rocm-\<version\>`, where \<version\> is the ROCm version.
-
-### Option 2: building from source
-
-There are two ways to build the MIGraphX sources.
-
-* [Use the ROCm build tool](https://github.com/ROCm/AMDMIGraphX#use-the-rocm-build-tool-rbuild) - This approach uses `[rbuild](https://github.com/ROCm/rbuild)` to install the prerequisites and build the libraries with just one command.
-
-  or
-
-* [Use CMake](https://github.com/ROCm/AMDMIGraphX#use-cmake-to-build-migraphx) - This approach uses a script to install the prerequisites, then uses CMake to build the source.
-
-For detailed steps on building from source and installing dependencies, refer to the following `README` file:
-
-[https://github.com/ROCm/AMDMIGraphX#building-from-source](https://github.com/ROCm/AMDMIGraphX#building-from-source)
-
-### Option 3: use docker
-
-To use Docker, follow these steps:
-
-1. The easiest way to set up the development environment is to use Docker. To build Docker from scratch, first clone the MIGraphX repository by running:
-
-    ```bash
-    git clone --recursive https://github.com/ROCm/AMDMIGraphX
-    ```
-
-2. The repository contains a Dockerfile from which you can build a Docker image as:
-
-    ```bash
-    docker build -t migraphx .
-    ```
-
-3. Then to enter the development environment, use Docker run:
-
-    ```bash
-    docker run --device='/dev/kfd' --device='/dev/dri' -v=`pwd`:/code/AMDMIGraphX -w /code/AMDMIGraphX --group-add video -it migraphx
-    ```
-
-The Docker image contains all the prerequisites required for the installation, so users can go to the folder `/code/AMDMIGraphX` and follow the steps mentioned in [Option 2: Building from Source](#option-2-building-from-source).
-
-## MIGraphX example
-
-MIGraphX provides both C++ and Python APIs. The following sections show examples of both using the Inception v3 model. To walk through the examples, fetch the Inception v3 ONNX model by running the following:
-
-```py
-import torch
-import torchvision.models as models
-inception = models.inception_v3(pretrained=True)
-torch.onnx.export(inception,torch.randn(1,3,299,299), "inceptioni1.onnx")
-```
-
-This will create `inceptioni1.onnx`, which can be imported in MIGraphX using C++ or Python API.
-
-### MIGraphX Python API
-
-Follow these steps:
-
-1. To import the MIGraphX module in Python script, set `PYTHONPATH` to the MIGraphX libraries installation. If binaries are installed using steps mentioned in [Option 1: Installing Binaries](#option-1-installing-binaries), perform the following action:
-
-    ```bash
-    export PYTHONPATH=$PYTHONPATH:/opt/rocm/
-    ```
-
-2. The following script shows the usage of Python API to import the ONNX model, compile it, and run inference on it. Set `LD_LIBRARY_PATH` to `/opt/rocm/` if required.
-
-    ```py
-    # import migraphx and numpy
-    import migraphx
-    import numpy as np
-    # import and parse inception model
-    model = migraphx.parse_onnx("inceptioni1.onnx")
-    # compile model for the GPU target
-    model.compile(migraphx.get_target("gpu"))
-    # optionally print compiled model
-    model.print()
-    # create random input image
-    input_image = np.random.rand(1, 3, 299, 299).astype('float32')
-    # feed image to model, 'x.1` is the input param name
-    results = model.run({'x.1': input_image})
-    # get the results back
-    result_np = np.array(results[0])
-    # print the inferred class of the input image
-    print(np.argmax(result_np))
-    ```
-
-    Find additional examples of Python API in the `/examples` directory of the MIGraphX repository.
-
-## MIGraphX C++ API
-
-Follow these steps:
-
-1. The following is a minimalist example that shows the usage of MIGraphX C++ API to load ONNX file, compile it for the GPU, and run inference on it. To use MIGraphX C++ API, you only need to load the `migraphx.hpp` file. This example runs inference on the Inception v3 model.
-
-    ```c++
-    #include <vector>
-    #include <string>
-    #include <algorithm>
-    #include <ctime>
-    #include <random>
-    #include <migraphx/migraphx.hpp>
-
-    int main(int argc, char** argv)
-    {
-        migraphx::program prog;
-        migraphx::onnx_options onnx_opts;
-        // import and parse onnx file into migraphx::program
-        prog = parse_onnx("inceptioni1.onnx", onnx_opts);
-        // print imported model
-        prog.print();
-        migraphx::target targ = migraphx::target("gpu");
-        migraphx::compile_options comp_opts;
-        comp_opts.set_offload_copy();
-        // compile for the GPU
-        prog.compile(targ, comp_opts);
-        // print the compiled program
-        prog.print();
-        // randomly generate input image
-        // of shape (1, 3, 299, 299)
-        std::srand(unsigned(std::time(nullptr)));
-        std::vector<float> input_image(1*299*299*3);
-        std::generate(input_image.begin(), input_image.end(), std::rand);
-        // users need to provide data for the input
-        // parameters in order to run inference
-        // you can query into migraph program for the parameters
-        migraphx::program_parameters prog_params;
-        auto param_shapes = prog.get_parameter_shapes();
-        auto input        = param_shapes.names().front();
-        // create argument for the parameter
-        prog_params.add(input, migraphx::argument(param_shapes[input], input_image.data()));
-        // run inference
-        auto outputs = prog.eval(prog_params);
-        // read back the output
-        float* results = reinterpret_cast<float*>(outputs[0].data());
-        float* max     = std::max_element(results, results + 1000);
-        int answer = max - results;
-        std::cout << "answer: " << answer << std::endl;
-    }
-    ```
-
-2. To compile this program, you can use CMake and you only need to link the `migraphx::c` library to use MIGraphX's C++ API. The following is the `CMakeLists.txt` file that can build the earlier example:
-
-    ```cmake
-    cmake_minimum_required(VERSION 3.5)
-    project (CAI)
-
-    set (CMAKE_CXX_STANDARD 14)
-    set (EXAMPLE inception_inference)
-
-    list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-    find_package (migraphx)
-
-    message("source file: " ${EXAMPLE}.cpp " ---> bin: " ${EXAMPLE})
-    add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
-
-    target_link_libraries(${EXAMPLE} migraphx::c)
-    ```
-
-3. To build the executable file, run the following from the directory containing the `inception_inference.cpp` file:
-
-    ```bash
-    mkdir build
-    cd build
-    cmake ..
-    make -j$(nproc)
-    ./inception_inference
-    ```
-
-:::{note}
-    Set `LD_LIBRARY_PATH` to `/opt/rocm/lib` if required during the build. Additional examples can be found in the MIGraphX repository under the `/examples/` directory.
-:::
-
-## Tuning MIGraphX
-
-MIGraphX uses MIOpen kernels to target AMD GPU. For the model compiled with MIGraphX, tune MIOpen to pick the best possible kernel implementation. The MIOpen tuning results in a significant performance boost. Tuning can be done by setting the environment variable `MIOPEN_FIND_ENFORCE=3`.
-
-:::{note}
-    The tuning process can take a long time to finish.
-:::
-
-**Example:** The average inference time of the inception model example shown previously over 100 iterations using untuned kernels is 0.01383ms. After tuning, it reduces to 0.00459ms, which is a 3x improvement. This result is from ROCm v4.5 on a MI100 GPU.
-
-:::{note}
-    The results may vary depending on the system configurations.
-:::
-
-For reference, the following code snippet shows inference runs for only the first 10 iterations for both tuned and untuned kernels:
-
-```console
-### UNTUNED ###
-iterator : 0
-Inference complete
-Inference time: 0.063ms
-iterator : 1
-Inference complete
-Inference time: 0.008ms
-iterator : 2
-Inference complete
-Inference time: 0.007ms
-iterator : 3
-Inference complete
-Inference time: 0.007ms
-iterator : 4
-Inference complete
-Inference time: 0.007ms
-iterator : 5
-Inference complete
-Inference time: 0.008ms
-iterator : 6
-Inference complete
-Inference time: 0.007ms
-iterator : 7
-Inference complete
-Inference time: 0.028ms
-iterator : 8
-Inference complete
-Inference time: 0.029ms
-iterator : 9
-Inference complete
-Inference time: 0.029ms
-
-### TUNED ###
-iterator : 0
-Inference complete
-Inference time: 0.063ms
-iterator : 1
-Inference complete
-Inference time: 0.004ms
-iterator : 2
-Inference complete
-Inference time: 0.004ms
-iterator : 3
-Inference complete
-Inference time: 0.004ms
-iterator : 4
-Inference complete
-Inference time: 0.004ms
-iterator : 5
-Inference complete
-Inference time: 0.004ms
-iterator : 6
-Inference complete
-Inference time: 0.004ms
-iterator : 7
-Inference complete
-Inference time: 0.004ms
-iterator : 8
-Inference complete
-Inference time: 0.004ms
-iterator : 9
-Inference complete
-Inference time: 0.004ms
-```
-
-### YModel
-
-The best inference performance through MIGraphX is conditioned upon having tuned kernel configurations stored in a `/home` local User Database (DB). If a user were to move their model to a different server or allow a different user to use it, they would have to run through the MIOpen tuning process again to populate the next User DB with the best kernel configurations and corresponding solvers.
-
-Tuning is time consuming, and if the users have not performed tuning, they would see discrepancies between expected or claimed inference performance and actual inference performance. This has led to repetitive and time-consuming tuning tasks for each user.
-
-MIGraphX introduces a feature, known as YModel, that stores the kernel config parameters found during tuning into a `.mxr` file. This ensures the same level of expected performance, even when a model is copied to a different user/system.
-
-The YModel feature is available starting from ROCm 5.4.1 and UIF 1.1.
-
-#### YModel example
-
-Through the `migraphx-driver` functionality, you can generate `.mxr` files with tuning information stored inside it by passing additional `--binary --output model.mxr` to `migraphx-driver` along with the rest of the necessary flags.
-
-For example, to generate `.mxr` file from the ONNX model, use the following:
-
-```bash
-./path/to/migraphx-driver compile --onnx resnet50.onnx --enable-offload-copy --binary --output resnet50.mxr
-```
-
-To run generated `.mxr` files through `migraphx-driver`, use the following:
-
-```bash
-./path/to/migraphx-driver run --migraphx resnet50.mxr --enable-offload-copy
-```
-
-Alternatively, you can use the MIGraphX C++ or Python API to generate `.mxr` files.
-
-![Generating an MXR file](../data/conceptual/image018.png "Generating an MXR file")
--- a/docs/conceptual/oversubscription.rst
+++ b/docs/conceptual/oversubscription.rst
@@ -0,0 +1,34 @@
+.. meta::
+   :description: Learn what causes oversubscription.
+   :keywords: warning, log, gpu, performance penalty, help
+
+*******************************************************************
+Oversubscription of hardware resources in AMD Instinct accelerators
+*******************************************************************
+
+When an AMD Instinct™ MI series accelerator enters an oversubscribed state, the ``amdgpu`` driver outputs the following
+message.
+
+``amdgpu: Runlist is getting oversubscribed. Expect reduced ROCm performance.``
+
+Oversubscription occurs when application demands exceed the available hardware resources. In an oversubscribed
+state, the hardware scheduler tries to manage resource usage in a round-robin fashion. However,
+this can result in reduced performance, as resources might be occupied by applications or queues not actively
+submitting work. The granularity of hardware resources occupied by an inactive queue can be on the order of
+milliseconds, during which the accelerator or GPU is effectively blocked and unable to process work submitted by other
+queues.
+
+What triggers oversubscription?
+===============================
+
+The system enters an oversubscribed state when one of the following conditions is met:
+
+* **Hardware queue limit exceeded**: The number of user-mode compute queues requested by applications exceeds the
+  hardware limit of 24 queues for current Instinct accelerators.
+
+* **Virtual memory context slots exceeded**: The number of user processes exceeds the number of available virtual memory
+  context slots, which is 11 for current Instinct accelerators.
+
+* **Multiple processes using cooperative workgroups**: More than one process attempts to use the cooperative workgroup
+  feature, leading to resource contention.
+
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -81,6 +81,7 @@ article_pages = [
        "file": "how-to/llm-fine-tuning-optimization/profiling-and-debugging",
        "os": ["linux"],
    },
+    {"file": "how-to/performance-validation/mi300x/vllm-benchmark", "os": ["linux"]},
    {"file": "how-to/system-optimization/index", "os": ["linux"]},
    {"file": "how-to/system-optimization/mi300x", "os": ["linux"]},
    {"file": "how-to/system-optimization/mi200", "os": ["linux"]},
--- a/docs/contribute/building.md
+++ b/docs/contribute/building.md
@@ -0,0 +1,150 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="Building ROCm documentation">
+  <meta name="keywords" content="documentation, Visual Studio Code, GitHub, command line,
+  AMD, ROCm">
+</head>
+
+# Building documentation
+
+## GitHub
+
+If you open a pull request and scroll down to the summary panel,
+there is a commit status section. Next to the line
+`docs/readthedocs.com:advanced-micro-devices-demo`, there is a `Details` link.
+If you click this, it takes you to the Read the Docs build for your pull request.
+
+![GitHub PR commit status](../data/contribute/commit-status.png)
+
+If you don't see this line, click `Show all checks` to get an itemized view.
+
+## Command line
+
+You can build our documentation via the command line using Python.
+
+See the `build.tools.python` setting in the [Read the Docs configuration file](https://github.com/ROCm/ROCm/blob/develop/.readthedocs.yaml) for the Python version used by Read the Docs to build documentation.
+
+See the [Python requirements file](https://github.com/ROCm/ROCm/blob/develop/docs/sphinx/requirements.txt) for Python packages needed to build the documentation.
+
+Use the Python Virtual Environment (`venv`) and run the following commands from the project root:
+
+```sh
+python3 -mvenv .venv
+
+.venv/bin/python     -m pip install -r docs/sphinx/requirements.txt
+.venv/bin/python     -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
+```
+
+Navigate to `_build/html/index.html` and open this file in a web browser.
+
+## Visual Studio Code
+
+With the help of a few extensions, you can create a productive environment to author and test
+documentation locally using Visual Studio (VS) Code. Follow these steps to configure VS Code:
+
+1. Install the required extensions:
+
+   * Python: `(ms-python.python)`
+   * Live Server: `(ritwickdey.LiveServer)`
+
+2. Add the following entries to `.vscode/settings.json`.
+
+    ```json
+      {
+        "liveServer.settings.root": "/.vscode/build/html",
+        "liveServer.settings.wait": 1000,
+        "python.terminal.activateEnvInCurrentTerminal": true
+      }
+    ```
+
+    * `liveServer.settings.root`: Sets the root of the output website for live previews. Must be changed
+      alongside the `tasks.json` command.
+    * `liveServer.settings.wait`: Tells the live server to delay the update so that Sphinx has time to
+      regenerate the site contents and the page doesn't refresh before the build is complete.
+    * `python.terminal.activateEnvInCurrentTerminal`: Activates the automatic virtual environment, so you
+      can build the site from the integrated terminal.
+
+3. Add the following tasks to `.vscode/tasks.json`.
+
+    ```json
+      {
+        "version": "2.0.0",
+        "tasks": [
+          {
+            "label": "Build Docs",
+            "type": "process",
+            "windows": {
+              "command": "${workspaceFolder}/.venv/Scripts/python.exe"
+            },
+            "command": "${workspaceFolder}/.venv/bin/python3",
+            "args": [
+              "-m",
+              "sphinx",
+              "-j",
+              "auto",
+              "-T",
+              "-b",
+              "html",
+              "-d",
+              "${workspaceFolder}/.vscode/build/doctrees",
+              "-D",
+              "language=en",
+              "${workspaceFolder}/docs",
+              "${workspaceFolder}/.vscode/build/html"
+            ],
+            "problemMatcher": [
+              {
+                "owner": "sphinx",
+                "fileLocation": "absolute",
+                "pattern": {
+                  "regexp": "^(?:.*\\.{3}\\s+)?(\\/[^:]*|[a-zA-Z]:\\\\[^:]*):(\\d+):\\s+(WARNING|ERROR):\\s+(.*)$",
+                  "file": 1,
+                  "line": 2,
+                  "severity": 3,
+                  "message": 4
+                }
+              },
+              {
+              "owner": "sphinx",
+                "fileLocation": "absolute",
+                "pattern": {
+                  "regexp": "^(?:.*\\.{3}\\s+)?(\\/[^:]*|[a-zA-Z]:\\\\[^:]*):{1,2}\\s+(WARNING|ERROR):\\s+(.*)$",
+                  "file": 1,
+                  "severity": 2,
+                  "message": 3
+                }
+              }
+            ],
+            "group": {
+              "kind": "build",
+              "isDefault": true
+            }
+          }
+        ]
+      }
+    ```
+
+    > Implementation detail: two problem matchers had to be defined,
+    > because VS Code doesn't tolerate some problem information being potentially
+    > absent. While a single regex could match all types of errors, if a capture
+    > group remains empty (the line number doesn't show up in all warning/error
+    > messages) but the `pattern` references said empty capture group, VS Code
+    > discards the message completely.
+
+4. Configure the Python virtual environment (`venv`).
+
+    From the Command Palette, run `Python: Create Environment`. Select `venv` environment and
+    `docs/sphinx/requirements.txt`.
+
+5. Build the docs.
+
+    Launch the default build task using one of the following options:
+
+    * A hotkey (the default is `Ctrl+Shift+B`)
+    * Issuing the `Tasks: Run Build Task` from the Command Palette
+
+6. Open the live preview.
+
+    Navigate to the site output within VS Code: right-click on `.vscode/build/html/index.html` and
+    select `Open with Live Server`. The contents should update on every rebuild without having to
+    refresh the browser.
--- a/docs/data/contribute/commit-status.png
+++ b/docs/data/contribute/commit-status.png
--- a/docs/how-to/llm-fine-tuning-optimization/llm-inference-frameworks.rst
+++ b/docs/how-to/llm-fine-tuning-optimization/llm-inference-frameworks.rst
@@ -16,7 +16,7 @@ This section discusses how to implement `vLLM <https://docs.vllm.ai/en/latest>`_
 vLLM inference
 ==============

-vLLM is renowned for its paged attention algorithm that can reduce memory consumption and increase throughput thanks to
+vLLM is renowned for its PagedAttention algorithm that can reduce memory consumption and increase throughput thanks to
 its paging scheme. Instead of allocating GPU high-bandwidth memory (HBM) for the maximum output token lengths of the
 models, the paged attention of vLLM allocates GPU HBM dynamically for its actual decoding lengths. This paged attention
 is also effective when multiple requests share the same key and value contents for a large value of beam search or
@@ -139,9 +139,7 @@ Refer to :ref:`mi300x-vllm-optimization` for performance optimization tips.

 ROCm provides a prebuilt optimized Docker image for validating the performance of LLM inference with vLLM 
 on the MI300X accelerator. The Docker image includes ROCm, vLLM, PyTorch, and tuning files in the CSV 
-format. For more information, see the guide to 
-`LLM inference performance validation with vLLM on the AMD Instinct™ MI300X accelerator <https://github.com/ROCm/MAD/blob/develop/benchmark/vllm/README.md>`_ 
-on the ROCm GitHub repository.
+format. For more information, see :doc:`/how-to/performance-validation/mi300x/vllm-benchmark`.

 .. _fine-tuning-llms-tgi:

--- a/docs/how-to/llm-fine-tuning-optimization/model-quantization.rst
+++ b/docs/how-to/llm-fine-tuning-optimization/model-quantization.rst
@@ -181,7 +181,7 @@ Installing bitsandbytes
      # Clone the github repo
      git clone --recurse https://github.com/ROCm/bitsandbytes.git
      cd bitsandbytes
-      git checkout rocm_enabled
+      git checkout rocm_enabled_multi_backend

      # Install dependencies 
      pip install -r requirements-dev.txt
--- a/docs/how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference.rst
+++ b/docs/how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference.rst
@@ -91,7 +91,7 @@ Setting up the base implementation environment
      # Use -DBNB_ROCM_ARCH to target a specific GPU architecture.
      git clone --recurse https://github.com/ROCm/bitsandbytes.git
      cd bitsandbytes
-      git checkout rocm_enabled
+      git checkout rocm_enabled_multi_backend
      pip install -r requirements-dev.txt
      cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S .
      python setup.py install
--- a/docs/how-to/performance-validation/mi300x/vllm-benchmark.rst
+++ b/docs/how-to/performance-validation/mi300x/vllm-benchmark.rst
@@ -0,0 +1,407 @@
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the unified
+                 ROCm Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+***********************************************************
+LLM inference performance validation on AMD Instinct MI300X
+***********************************************************
+
+.. _vllm-benchmark-unified-docker:
+
+The `ROCm vLLM Docker <https://hub.docker.com/r/rocm/vllm/tags>`_ image offers
+a prebuilt, optimized environment designed for validating large language model
+(LLM) inference performance on the AMD Instinct™ MI300X accelerator. This
+ROCm vLLM Docker image integrates vLLM and PyTorch tailored specifically for the
+MI300X accelerator and includes the following components:
+
+* `ROCm 6.2.1 <https://github.com/ROCm/ROCm>`_
+
+* `vLLM 0.6.4 <https://docs.vllm.ai/en/latest>`_
+
+* `PyTorch 2.5.0 <https://github.com/pytorch/pytorch>`_
+
+* Tuning files (in CSV format)
+
+With this Docker image, you can quickly validate the expected inference
+performance numbers on the MI300X accelerator. This topic also provides tips on
+optimizing performance with popular AI models.
+
+.. hlist::
+   :columns: 6
+
+   * Llama 3.1 8B
+
+   * Llama 3.1 70B
+
+   * Llama 3.1 405B
+
+   * Llama 2 7B
+
+   * Llama 2 70B
+
+   * Mixtral 8x7B
+
+   * Mixtral 8x22B
+
+   * Mistral 7B
+
+   * Qwen2 7B
+
+   * Qwen2 72B
+
+   * JAIS 13B
+
+   * JAIS 30B
+
+.. _vllm-benchmark-vllm:
+
+.. note::
+
+   vLLM is a toolkit and library for LLM inference and serving. AMD implements
+   high-performance custom kernels and modules in vLLM to enhance performance.
+   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+   more information.
+
+Getting started
+===============
+
+Use the following procedures to reproduce the benchmark results on an
+MI300X accelerator with the prebuilt vLLM Docker image.
+
+.. _vllm-benchmark-get-started:
+
+1. Disable NUMA auto-balancing.
+
+   To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
+   might hang until the periodic balancing is finalized. For more information,
+   see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
+
+   .. code-block:: shell
+
+      # disable automatic NUMA balancing
+      sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+      # check if NUMA balancing is disabled (returns 0 if disabled)
+      cat /proc/sys/kernel/numa_balancing
+      0
+
+2. Download the :ref:`ROCm vLLM Docker image <vllm-benchmark-unified-docker>`.
+
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+
+Once setup is complete, you can choose between two options to reproduce the
+benchmark results:
+
+-  :ref:`MAD-integrated benchmarking <vllm-benchmark-mad>`
+
+-  :ref:`Standalone benchmarking <vllm-benchmark-standalone>`
+
+.. _vllm-benchmark-mad:
+
+MAD-integrated benchmarking
+===========================
+
+Clone the ROCm Model Automation and Dashboarding (`MAD <https://github.com/ROCm/MAD>`__) repository to a local
+directory and install the required packages on the host machine.
+
+.. code-block:: shell
+
+   git clone https://github.com/ROCm/MAD
+   cd MAD
+   pip install -r requirements.txt
+
+Use this command to run a performance benchmark test of the Llama 3.1 8B model
+on one GPU with the ``float16`` data type on the host machine.
+
+.. code-block:: shell
+
+   export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+   python3 tools/run_models.py --tags pyt_vllm_llama-3.1-8b --keep-model-dir --live-output --timeout 28800
+
+ROCm MAD launches a Docker container with the name
+``container_ci-pyt_vllm_llama-3.1-8b``. The latency and throughput reports of the
+model are collected in the following path: ``~/MAD/reports_float16/``.
+
+Although the following models are preconfigured to collect latency and
+throughput performance data, you can also change the benchmarking parameters.
+Refer to the :ref:`Standalone benchmarking <vllm-benchmark-standalone>` section.
+
+Available models
+----------------
+
+.. hlist::
+   :columns: 3
+
+   * ``pyt_vllm_llama-3.1-8b``
+
+   * ``pyt_vllm_llama-3.1-70b``
+
+   * ``pyt_vllm_llama-3.1-405b``
+
+   * ``pyt_vllm_llama-2-7b``
+
+   * ``pyt_vllm_llama-2-70b``
+
+   * ``pyt_vllm_mixtral-8x7b``
+
+   * ``pyt_vllm_mixtral-8x22b``
+
+   * ``pyt_vllm_mistral-7b``
+
+   * ``pyt_vllm_qwen2-7b``
+
+   * ``pyt_vllm_qwen2-72b``
+
+   * ``pyt_vllm_jais-13b``
+
+   * ``pyt_vllm_jais-30b``
+
+   * ``pyt_vllm_llama-3.1-8b_fp8``
+
+   * ``pyt_vllm_llama-3.1-70b_fp8``
+
+   * ``pyt_vllm_llama-3.1-405b_fp8``
+
+   * ``pyt_vllm_mixtral-8x7b_fp8``
+
+   * ``pyt_vllm_mixtral-8x22b_fp8``
+
+.. _vllm-benchmark-standalone:
+
+Standalone benchmarking
+=======================
+
+You can run the vLLM benchmark tool independently by starting the
+:ref:`Docker container <vllm-benchmark-get-started>` as shown in the following
+snippet.
+
+.. code-block:: shell
+
+   docker pull rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+   docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name vllm_v0.6.4 rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+
+In the Docker container, clone the ROCm MAD repository and navigate to the
+benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+.. code-block:: shell
+
+   git clone https://github.com/ROCm/MAD
+   cd MAD/scripts/vllm
+
+Command
+-------
+
+To start the benchmark, use the following command with the appropriate options.
+See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
+options and their descriptions.
+
+.. code-block:: shell
+
+   ./vllm_benchmark_report.sh -s $test_option -m $model_repo -g $num_gpu -d $datatype
+
+See the :ref:`examples <vllm-benchmark-run-benchmark>` for more information.
+
+.. note::
+
+   The input sequence length, output sequence length, and tensor parallelism (TP) are
+   already configured. You don't need to specify them with this script.
+
+.. note::
+
+   If you encounter the following error, provide a Hugging Face token that is
+   authorized to access the gated models.
+
+   .. code-block:: shell
+
+      OSError: You are trying to access a gated repo.
+
+      # pass your HF_TOKEN
+      export HF_TOKEN=$your_personal_hf_token
+
+.. _vllm-benchmark-standalone-options:
+
+Options
+-------
+
+.. list-table::
+   :header-rows: 1
+   :align: center
+
+   * - Name
+     - Options
+     - Description
+
+   * - ``$test_option``
+     - latency
+     - Measure decoding token latency
+
+   * -
+     - throughput
+     - Measure token generation throughput
+
+   * -
+     - all
+     - Measure both throughput and latency
+
+   * - ``$model_repo``
+     - ``meta-llama/Meta-Llama-3.1-8B-Instruct``
+     - Llama 3.1 8B
+
+   * - (``float16``)
+     - ``meta-llama/Meta-Llama-3.1-70B-Instruct``
+     - Llama 3.1 70B
+
+   * -
+     - ``meta-llama/Meta-Llama-3.1-405B-Instruct``
+     - Llama 3.1 405B
+
+   * -
+     - ``meta-llama/Llama-2-7b-chat-hf``
+     - Llama 2 7B
+
+   * -
+     - ``meta-llama/Llama-2-70b-chat-hf``
+     - Llama 2 70B
+
+   * -
+     - ``mistralai/Mixtral-8x7B-Instruct-v0.1``
+     - Mixtral 8x7B
+
+   * -
+     - ``mistralai/Mixtral-8x22B-Instruct-v0.1``
+     - Mixtral 8x22B
+
+   * -
+     - ``mistralai/Mistral-7B-Instruct-v0.3``
+     - Mistral 7B
+
+   * -
+     - ``Qwen/Qwen2-7B-Instruct``
+     - Qwen2 7B
+
+   * -
+     - ``Qwen/Qwen2-72B-Instruct``
+     - Qwen2 72B
+
+   * -
+     - ``core42/jais-13b-chat``
+     - JAIS 13B
+
+   * -
+     - ``core42/jais-30b-chat-v3``
+     - JAIS 30B
+
+   * - ``$model_repo``
+     - ``amd/Meta-Llama-3.1-8B-Instruct-FP8-KV``
+     - Llama 3.1 8B
+
+   * - (``float8``)
+     - ``amd/Meta-Llama-3.1-70B-Instruct-FP8-KV``
+     - Llama 3.1 70B
+
+   * -
+     - ``amd/Meta-Llama-3.1-405B-Instruct-FP8-KV``
+     - Llama 3.1 405B
+
+   * -
+     - ``amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV``
+     - Mixtral 8x7B
+
+   * -
+     - ``amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV``
+     - Mixtral 8x22B
+
+   * - ``$num_gpu``
+     - 1 or 8
+     - Number of GPUs
+
+   * - ``$datatype``
+     - ``float16`` or ``float8``
+     - Data type
+
+.. _vllm-benchmark-run-benchmark:
+
+Running the benchmark on the MI300X accelerator
+-----------------------------------------------
+
+Here are some examples of running the benchmark with various options.
+See :ref:`Options <vllm-benchmark-standalone-options>` for the list of
+options and their descriptions.
+
+Example 1: latency benchmark
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ 
+Use these commands to benchmark the latency of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.
+
+.. code-block:: shell
+
+   ./vllm_benchmark_report.sh -s latency -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
+   ./vllm_benchmark_report.sh -s latency -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8
+
+Find the latency reports at:
+
+- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_latency_report.csv``
+
+- ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_latency_report.csv``
+
+Example 2: throughput benchmark
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Use these commands to benchmark the throughput of the Llama 3.1 8B model on one GPU with the ``float16`` and ``float8`` data types.
+
+.. code-block:: shell
+
+   ./vllm_benchmark_report.sh -s throughput -m meta-llama/Meta-Llama-3.1-8B-Instruct -g 1 -d float16
+   ./vllm_benchmark_report.sh -s throughput -m amd/Meta-Llama-3.1-8B-Instruct-FP8-KV -g 1 -d float8
+
+Find the throughput reports at:
+
+- ``./reports_float16/summary/Meta-Llama-3.1-8B-Instruct_throughput_report.csv``
+
+- ``./reports_float8/summary/Meta-Llama-3.1-8B-Instruct-FP8-KV_throughput_report.csv``
+
+.. raw:: html
+
+   <style>
+   mjx-container[jax="CHTML"][display="true"] {
+       text-align: left;
+       margin: 0;
+   }
+   </style>
+
+.. note::
+
+   Throughput is calculated as:
+
+   - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+   - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+
+Further reading
+===============
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/tuning-guides/mi300x/workload`.
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about system settings and management practices to configure your system for
+  MI300X accelerators, see :doc:`/how-to/system-optimization/mi300x`.
+
+- To learn how to run LLM models from Hugging Face or your own model, see
+  :doc:`Using ROCm for AI </how-to/rocm-for-ai/index>`.
+
+- To learn how to optimize inference on LLMs, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/llm-fine-tuning-optimization/index>`.
+
+- For a list of other ready-made Docker images for ROCm, see the
+  :doc:`Docker image support matrix <rocm-install-on-linux:reference/docker-image-support-matrix>`.
+
+- To compare with the previous version of the ROCm vLLM Docker image for performance validation, refer to
+  `LLM inference performance validation on AMD Instinct MI300X (ROCm 6.2.0) <https://rocm.docs.amd.com/en/docs-6.2.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_.
+
--- a/docs/how-to/system-optimization/mi100.md
+++ b/docs/how-to/system-optimization/mi100.md
@@ -342,8 +342,8 @@ If SMT is enabled by setting "CCD/Core/Thread Enablement > SMT Control" to
 (logical) cores of the system:

 * In the server BIOS, set IOMMU to "Enabled".
-* When configuring the Grub boot loader, add the following arguments for the
-  Linux kernel: `amd_iommu=on iommu=pt`
+* When configuring the Grub boot loader, add the following argument for the
+  Linux kernel: `iommu=pt`
 * Update Grub to use the modified configuration:

  ```shell
@@ -355,7 +355,7 @@ If SMT is enabled by setting "CCD/Core/Thread Enablement > SMT Control" to

  ```none
  [...]
-  [   0.000000] Kernel command line: [...] amd_iommu=on iommu=pt
+  [   0.000000] Kernel command line: [...] iommu=pt
     [...]
  ```

--- a/docs/how-to/system-optimization/mi200.md
+++ b/docs/how-to/system-optimization/mi200.md
@@ -327,8 +327,8 @@ If SMT is enabled by setting "CCD/Core/Thread Enablement > SMT Control" to
 (logical) cores of the system:

 * In the server BIOS, set IOMMU to "Enabled".
-* When configuring the Grub boot loader, add the following arguments for the
-  Linux kernel: `amd_iommu=on iommu=pt`
+* When configuring the Grub boot loader, add the following argument for the
+  Linux kernel: `iommu=pt`
 * Update Grub to use the modified configuration:

  ```shell
@@ -340,7 +340,7 @@ If SMT is enabled by setting "CCD/Core/Thread Enablement > SMT Control" to

  ```none
  [...]
-  [   0.000000] Kernel command line: [...] amd_iommu=on iommu=pt
+  [   0.000000] Kernel command line: [...] iommu=pt
     [...]
  ```

--- a/docs/how-to/system-optimization/mi300x.rst
+++ b/docs/how-to/system-optimization/mi300x.rst
@@ -299,7 +299,7 @@ For a system that has AMD host CPUs add this to ``GRUB_CMDLINE_LINUX``:

 .. code-block:: text

-   amd_iommu=on iommu=pt
+   iommu=pt

 Otherwise, if the system has Intel host CPUs add this instead to
 ``GRUB_CMDLINE_LINUX``:
@@ -500,7 +500,7 @@ If SMT is enabled by setting ``CCD/Core/Thread Enablement > SMT Control`` to

 #. In the server BIOS, set IOMMU to ``Enabled``.

-#. When configuring the GRUB boot loader, add the following arguments for the Linux kernel: ``amd_iommu=on iommu=pt``.
+#. When configuring the GRUB boot loader, add the following argument for the Linux kernel: ``iommu=pt``.

 #. Update GRUB.

@@ -515,7 +515,7 @@ If SMT is enabled by setting ``CCD/Core/Thread Enablement > SMT Control`` to
 .. code-block:: shell

   [...]
-   [   0.000000] Kernel command line: [...] amd_iommu=on iommu=pt
+   [   0.000000] Kernel command line: [...] iommu=pt
   [...]

 Once the system is properly configured, ROCm software can be
--- a/docs/how-to/system-optimization/w6000-v620.md
+++ b/docs/how-to/system-optimization/w6000-v620.md
@@ -111,7 +111,7 @@ sudo virsh net-start default /*to enable Virtual network by default
 Enable input-output memory management unit (IOMMU) in GRUB settings by adding the following line to `/etc/default/grub`:

 ```none
-GRUB_CMDLINE_LINUX_DEFAULT="quiet splash amd_iommu=on" for AMD CPU
+GRUB_CMDLINE_LINUX_DEFAULT="quiet splash" for AMD CPU
 ```

 Update grub and reboot
--- a/docs/how-to/tuning-guides/mi300x/index.rst
+++ b/docs/how-to/tuning-guides/mi300x/index.rst
@@ -8,6 +8,8 @@ accelerators. They include detailed instructions on system settings and
 application tuning suggestions to help you fully leverage the capabilities of
 these accelerators, thereby achieving optimal performance.

+* :doc:`/how-to/performance-validation/mi300x/vllm-benchmark`
+
 * :doc:`/how-to/tuning-guides/mi300x/system`

 * :doc:`/how-to/tuning-guides/mi300x/workload`
--- a/docs/how-to/tuning-guides/mi300x/workload.rst
+++ b/docs/how-to/tuning-guides/mi300x/workload.rst
@@ -152,9 +152,7 @@ address any new bottlenecks that may emerge.

 ROCm provides a prebuilt optimized Docker image that has everything required to implement
 the tips in this section. It includes ROCm, vLLM, PyTorch, and tuning files in the CSV 
-format. For more information, see the guide to 
-`LLM inference performance validation with vLLM on the AMD Instinct™ MI300X accelerator <https://github.com/ROCm/MAD/blob/develop/benchmark/vllm/README.md>`_ 
-on the ROCm GitHub repository.
+format. For more information, see :doc:`/how-to/performance-validation/mi300x/vllm-benchmark`.

 .. _mi300x-profiling-tools:

@@ -378,11 +376,10 @@ Refer to `vLLM documentation <https://docs.vllm.ai/en/latest/models/performance.
 for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
 usage with ROCm.

-ROCm provides a prebuilt optimized Docker image for validating the performance of LLM inference with vLLM 
-on the MI300X accelerator. The Docker image includes ROCm, vLLM, PyTorch, and tuning files in the CSV 
-format. For more information, see the guide to 
-`LLM inference performance validation with vLLM on the AMD Instinct™ MI300X accelerator <https://github.com/ROCm/MAD/blob/develop/benchmark/vllm/README.md>`_ 
-on the ROCm GitHub repository.
+ROCm provides a prebuilt optimized Docker image for validating the performance
+of LLM inference with vLLM on the MI300X accelerator. The Docker image includes
+ROCm, vLLM, PyTorch, and tuning files in the CSV format. For more information,
+see :doc:`/how-to/performance-validation/mi300x/vllm-benchmark`.

 Maximize throughput
 -------------------
--- a/docs/index.md
+++ b/docs/index.md
@@ -45,7 +45,7 @@ ROCm documentation is organized into the following categories:
 * [Using ROCm for HPC](./how-to/rocm-for-hpc/index.rst)
 * [Fine-tuning LLMs and inference optimization](./how-to/llm-fine-tuning-optimization/index.rst)
 * [System optimization](./how-to/system-optimization/index.rst)
-* [AMD Instinct MI300X tuning guides](./how-to/tuning-guides/mi300x/index.rst)
+* [AMD Instinct MI300X performance validation and tuning](./how-to/tuning-guides/mi300x/index.rst)
 * [GPU cluster networking](https://rocm.docs.amd.com/projects/gpu-cluster-networking/en/latest/index.html)
 * [System debugging](./how-to/system-debugging.md)
 * [Using MPI](./how-to/gpu-enabled-mpi.rst)
@@ -64,7 +64,7 @@ ROCm documentation is organized into the following categories:
 * [Using CMake](./conceptual/cmake-packages.rst)
 * [ROCm & PCIe atomics](./conceptual/More-about-how-ROCm-uses-PCIe-Atomics.rst)
 * [Inception v3 with PyTorch](./conceptual/ai-pytorch-inception.md)
-* [Inference optimization with MIGraphX](./conceptual/ai-migraphx-optimization.md)
+* [Oversubscription of hardware resources](./conceptual/oversubscription.rst)
 :::

 <!-- markdownlint-disable MD051 -->
--- a/docs/reference/gpu-arch-specs.rst
+++ b/docs/reference/gpu-arch-specs.rst
@@ -7,6 +7,8 @@ Accelerator and GPU hardware specifications

 The following tables provide an overview of the hardware specifications for AMD Instinct™ accelerators, and AMD Radeon™ PRO and Radeon™ GPUs.

+For more information about ROCm hardware compatibility, see the ROCm `Compatibility matrix <https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html>`_.
+
 .. tab-set::

  .. tab-item:: AMD Instinct accelerators
@@ -35,11 +37,11 @@ The following tables provide an overview of the hardware specifications for AMD
          - CDNA3
          - gfx941 or gfx942
          - 192
-          - 304
+          - 304 (38 per XCD)
          - 64
          - 64
          - 256
-          - 32
+          - 32 (4 per XCD)
          - 32
          - 16 per 2 CUs
          - 64 per 2 CUs
@@ -50,11 +52,11 @@ The following tables provide an overview of the hardware specifications for AMD
          - CDNA3
          - gfx940 or gfx942
          - 128
-          - 228
+          - 228 (38 per XCD)
          - 64
          - 64
          - 256
-          - 24
+          - 24 (4 per XCD)
          - 32
          - 16 per 2 CUs
          - 64 per 2 CUs
@@ -80,7 +82,7 @@ The following tables provide an overview of the hardware specifications for AMD
          - CDNA2
          - gfx90a
          - 128
-          - 208
+          - 208 (104 per GCD)
          - 64
          - 64
          -
@@ -770,3 +772,7 @@ scalar instructions.
 **GCD**

 Graphics Compute Die.
+
+**XCD**
+
+Accelerator Complex Die.
--- a/docs/reference/precision-support.rst
+++ b/docs/reference/precision-support.rst
@@ -41,6 +41,8 @@ together with their corresponding HIP type and a short description.
      - ``int64_t``, ``uint64_t``
      - A signed or unsigned 64-bit integer

+.. _precision_support_floating_point_types:
+
 Floating-point types
 ==========================================

--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -70,9 +70,11 @@ subtrees:
      - file: how-to/system-optimization/w6000-v620.md
        title: AMD RDNA 2
  - file: how-to/tuning-guides/mi300x/index.rst
-    title: AMD MI300X tuning guides
+    title: AMD MI300X performance validation and tuning
    subtrees:
    - entries:
+      - file: how-to/performance-validation/mi300x/vllm-benchmark.rst
+        title: Performance validation
      - file: how-to/tuning-guides/mi300x/system.rst
        title: System tuning
      - file: how-to/tuning-guides/mi300x/workload.rst
@@ -158,8 +160,8 @@ subtrees:
    title: ROCm & PCIe atomics
  - file: conceptual/ai-pytorch-inception.md
    title: Inception v3 with PyTorch
-  - file: conceptual/ai-migraphx-optimization.md
-    title: Inference optimization with MIGraphX
+  - file: conceptual/oversubscription.rst
+    title: Oversubscription of hardware resources

 - caption: Reference
  entries:
@@ -180,6 +182,7 @@ subtrees:
    - entries:
      - file: contribute/toolchain.md
        title: ROCm documentation toolchain
+      - file: contribute/building.md
  - file: contribute/feedback.md
    title: Providing feedback about the ROCm documentation
  - file: about/license.md
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.8.2
+rocm-docs-core==1.8.3
 sphinx-reredirects
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -92,7 +92,7 @@ requests==2.32.3
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.8.2
+rocm-docs-core==1.8.3
    # via -r requirements.in
 smmap==5.0.1
    # via gitdb
--- a/tools/autotag/templates/highlights/5.3.0.md
+++ b/tools/autotag/templates/highlights/5.3.0.md
@@ -184,5 +184,5 @@ clinfo, and HelloWord.cl and cause a system crash.
 * IRQ remapping does not support X2APIC mode
 * NMI error

-Workaround: To avoid the system crash, add `amd_iommu=on iommu=pt` as the kernel bootparam, as
+Workaround: To avoid the system crash, add `iommu=pt` as the kernel bootparam, as
 indicated in the warning message.
Author	SHA1	Message	Date
Peter Park	17d04124c1	update links to vllm perf validation doc	2024-10-30 15:52:12 -04:00
Daniel Su	4b8fdf1ae3	External CI: various fixes (#3963 )	2024-10-30 15:52:12 -04:00
Joseph Macaranas	c88f3996dc	External CI: Add aqlprofile to Tensile test dependencies (#3961 )	2024-10-30 15:52:12 -04:00
Peter Park	5b53802c54	add suggestions to vllm perf validation doc	2024-10-30 14:08:35 -04:00
Peter Park	bdeef73263	add vllm performance validation doc	2024-10-30 12:51:40 -04:00
Joseph Macaranas	2960fbbbd6	External CI: Always force rocPyDecode cleanup step	2024-10-30 10:50:44 -04:00
Brian Cornille	d367700f84	Improve consistency of the gpu-arch-specs table. (#3936 ) * Improve consistency of the gpu-arch-specs table. * Add XCD to the glossary.	2024-10-30 10:01:34 -04:00
Joseph Macaranas	0298a79c97	External CI: do not assume python is python3 for rocpydecode (#3955 )	2024-10-29 13:28:43 -04:00
Joseph Macaranas	ae2b197fc8	External CI: rocpydecode dependency installation change (#3954 ) - Install pybind11 through pip instead of apt - Add pip-installed pybind11 path to CMAKE_PREFIX_PATH - Tested against source of PR 122	2024-10-29 11:01:54 -04:00
Joseph Macaranas	bce439ecac	External CI: pytorch pipeline updates (#3948 ) To support recent upstream changes and issues observed.	2024-10-25 17:17:16 -04:00
Daniel Su	9fdd785979	External CI: fix HIP_PIPELINE_ID (#3944 )	2024-10-25 11:23:32 -04:00
Daniel Su	bc7ab7707d	External CI: move gpu-diag directly before tests (#3943 )	2024-10-25 11:23:23 -04:00
Daniel Su	55239688b4	External CI: enumerate GPUs in gpu-diagnostics (#3942 )	2024-10-24 16:56:12 -04:00
Daniel Su	d90775066f	External CI: rocDecode add libva-amdgpu-dev dependency (#3940 )	2024-10-24 12:06:51 -04:00
dependabot[bot]	60d99fe592	Build(deps): Bump rocm-docs-core from 1.8.2 to 1.8.3 in /docs/sphinx (#3933 ) Bumps [rocm-docs-core](https://github.com/ROCm/rocm-docs-core) from 1.8.2 to 1.8.3. - [Release notes](https://github.com/ROCm/rocm-docs-core/releases) - [Changelog](https://github.com/ROCm/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/ROCm/rocm-docs-core/compare/v1.8.2...v1.8.3) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-10-23 12:58:06 -06:00
Daniel Su	6b54336374	External CI: AMDMIGraphX greater-equal pip dependencies (#3939 )	2024-10-23 14:55:09 -04:00
Daniel Su	fd2f2dc77a	External CI: add support to disable individual component tests (#3938 )	2024-10-23 14:48:34 -04:00
Joseph Macaranas	fe7cb52882	External CI: Add CK into pytorch build environment (#3934 )	2024-10-22 11:46:51 -04:00
Daniel Su	93be8ca70f	External CI: create a GPU diagnostics template (#3932 )	2024-10-22 09:33:35 -04:00
Daniel Su	62d855a658	External CI: disable rocMLIR tests (#3931 ) * External CI: disable rocMLIR tests * roctracer AMDGPU_TARGETS flag	2024-10-21 14:47:16 -04:00
Joseph Macaranas	82a14d78cc	External CI: Moved location of upstream pytorch build scripts (#3930 ) https://github.com/pytorch/pytorch/pull/138103	2024-10-21 14:26:58 -04:00
Joseph Macaranas	91a06398ca	External CI: hipBLASLt build now requires python packaging module (#3926 ) https://github.com/ROCm/hipBLASLt/pull/1250/files#diff-fee2e6f068b33fca3a1dc49392de8848dbf05c3f4632b680abb1052523e5a30fR35	2024-10-21 10:25:52 -04:00
Joseph Greathouse	d80340d0a2	Merge pull request #3919 from ROCm/krussell/amdiommufix docs: Remove invalid amd_iommu=on parameter	2024-10-18 10:05:56 -05:00
Kent Russell	74333b667d	docs: Remove invalid amd_iommu=on parameter Per kernel-parameters.txt, there is no "on" option for amd_iommu. While intel_iommu has it, amd_iommu is automatically on unless specified otherwise. For more info, see these 2 links: https://www.kernel.org/doc/Documentation/admin-guide/kernel-parameters.txt `75aa74d52f/drivers/iommu/amd/init.c (L3481)` Signed-off-by: Kent Russell <kent.russell@amd.com>	2024-10-18 10:44:07 -04:00
spolifroni-amd	a0f88ce17e	added a link to the compatibility matrix (#3904 ) * added a link to the compatibility matrix * removed quotes	2024-10-17 16:35:03 -04:00
Joseph Macaranas	23dde7e7b6	External CI: Use pip to install latest cmake on test system (#3915 )	2024-10-17 14:07:33 -04:00
Daniel Su	6177d7c635	External CI: MIOpen parse test results (#3913 )	2024-10-17 10:53:27 -04:00
Daniel Su	be5f00737b	External CI: omnitrace/rocprof-sys pipeline IDs (#3908 )	2024-10-16 11:18:09 -04:00
Daniel Su	ba9b3b1ec3	External CI: create rocprofiler-systems pipeline (#3906 )	2024-10-15 16:40:02 -04:00
Daniel Su	c9e89f108e	External CI: use Boost template for MIOpen (#3903 )	2024-10-15 14:17:41 -04:00
MKKnorr	d1e5db13be	Documentation: Add reference to precision-support floating-point types (#3899 )	2024-10-15 09:33:43 -06:00
Peter Park	b541be7bcb	Update bitsandbytes branch in docs (#3898 )	2024-10-15 10:47:56 -04:00
Joseph Macaranas	1a27f64e5f	External CI: Add pipeline to build upstream boost (#3896 )	2024-10-15 10:05:50 -04:00
Peter Park	49342eaed3	Merge pull request #3887 from peterjunpark/docs/6.2.4 docs/6.2.4: add oversubscription conceptual doc (#3885)	2024-10-11 16:00:42 -04:00
Peter Park	837175aea1	add oversubscription conceptual doc (#3885 ) (cherry picked from commit `d0ecf51b0c`)	2024-10-11 15:53:19 -04:00
Peter Park	d0ecf51b0c	add oversubscription conceptual doc (#3885 ) add mitigiation steps add to toc move page for build move doc fix spelling update doc update oversubscription update order fix spelling add oversubscription to wordlist move oversubscription topic to bottom of toc and index	2024-10-11 15:47:23 -04:00
Joseph Macaranas	5656ea9285	External CI: Tensile pipeline (#3884 )	2024-10-11 13:47:37 -04:00
Daniel Su	4fa8be6136	External CI: use ctest for rocm-examples (#3877 )	2024-10-09 17:03:25 -04:00
Daniel Su	40a4658fe4	External CI: programmatically get latest aqlprofile (#3876 )	2024-10-09 10:10:51 -04:00
Sam Wu	abc0e6a087	Add building doc section (#3873 )	2024-10-08 10:01:17 -06:00
Daniel Su	3d16142166	External CI: add libstdc++-12 to rocMLIR (#3874 )	2024-10-08 09:50:53 -04:00
Joseph Macaranas	21d0f09a88	External CI: Stop building gfx90a (#3872 ) Save on VM resources until infrastructure has test targets.	2024-10-07 18:21:03 -04:00
Daniel Su	2008056d90	External CI: update component default/mainline branches (#3871 )	2024-10-07 16:44:49 -04:00
Daniel Su	f72e28afbb	External CI: add a global variable to control gfx942 tests (#3864 )	2024-10-04 18:57:19 -04:00
spolifroni-amd	a8dd588a72	Removed MIGraphX optimization page (#3848 )	2024-10-04 17:06:51 -04:00
Joseph Macaranas	f7dbbb5ad8	External CI: Add option to pull mainline branch for dependencies (#3689 ) * External CI: Add option to pull mainline branch for dependencies * Missing parameter for mainline branch dependencies. * External CI: mainline branch definitions	2024-10-04 16:28:16 -04:00
Sam Wu	6d5d22ac6a	Set devops team as codeowners for rocm-build (#3860 ) * Set ext CI as codeowners for rocm-build * Update CODEOWNERS to rocm-devops	2024-10-04 11:09:17 -06:00
spolifroni-amd	1a86548a2c	updated the radeon note, as it were (#3857 ) * updated the radeon note, as it were * updated the note again	2024-10-03 10:58:58 -04:00