Compare commits

...

29 Commits

Author SHA1 Message Date
Daniel Su
393df3e05c [Ex CI] hipSPARSELt monorepo enablement (#5033) 2025-07-11 16:40:18 -04:00
Daniel Su
aa3cdcb3c3 [Ex CI] increase hipSPARSELt test timeout (#5028) 2025-07-10 12:04:06 -04:00
Pratik Basyal
e8bb027c20 HIP 7.0 upcoming changes blog link updated (#5021) 2025-07-10 09:53:44 -04:00
Pratik Basyal
544186aef8 ROCm for HPC table update for Develop (#5015) (#5016) (#5019)
* ROCm for HPC table update for 6.4.0 (#5015) (#5016)

* 6.4.0 updates synced

* Minor change

* Link update
2025-07-09 14:57:53 -04:00
Peter Park
22524eeaa5 fix xrefs in vllm-0.9.0.1-20250605.rst (#5017) 2025-07-09 14:38:24 -04:00
Peter Park
d471b04cd5 Update vLLM Docker doc for 07/02 2025-07-09 11:38:27 -04:00
Di Nguyen
1c7cff8a47 Merge pull request #5011 from ROCm/zenguyen/disable-device-merge-inplace-rocprim
[rocPRIM] Disable device_merge_inplace unit test for rocPRIM
2025-07-09 09:12:08 -06:00
Daniel Su
84c664074f [Ex CI] add OS to copyHIP filenames (#5012) 2025-07-09 10:37:23 -04:00
NguyenNhuDi
7c6083d840 disabled device_merge_inplace 2025-07-08 14:08:53 -06:00
Daniel Su
94099b1398 [Ex CI] rocPyDecode: fix test running (#5002) 2025-07-08 14:32:30 -04:00
Peter Park
3b3fc4894b Fix xrefs and Sphinx warnings in documentation
Fix xrefs and Sphinx warnings in documentation
2025-07-08 13:22:53 -04:00
Daniel Su
8aba1d2318 [Ex CI] fix printed artifact download links (#4998) 2025-07-04 14:41:33 -04:00
Mirza Halilčević
e9e75cfc46 Merge pull request #4963 from ROCm/pybind11
Add pybind11 as a pip module requirement for azure
2025-07-04 13:35:24 +02:00
Peter Park
58b3ad0509 Fix Docker run commands in Megatron-LM Docker doc (#4996)
* fix megatron-lm docker run commands

* update --shm-size option
2025-07-02 14:19:27 -04:00
Daniel Su
523d8520f3 [Ex CI] rocBLAS: increase test timeout to 2 hours (#4995) 2025-07-02 12:16:50 -04:00
Peter Park
d0c8ba0805 Add Wan2.1 to PyTorch inference Docker documentation (#4984)
* add wan2.1 to pyt inference models

* update group name

* fix container tag

* fix group name

* change documented data type to bfloat16

* fix col width
2025-07-02 09:58:37 -04:00
ammallya
73de8a3e46 Removing failing checkout step 2025-07-01 11:25:17 -07:00
Daniel Su
1fc312f90f [Ex CI] fix hardcoded gfx in MIOpen CK script (#4993) 2025-06-30 15:34:54 -04:00
Daniel Su
fde2647ccd [Ex CI] migrate rocBLAS to monorepo (#4987) 2025-06-30 15:16:58 -04:00
Daniel Su
798c8debb5 [Ex CI] consolidate artifact extraction and deletion in deps-rocm (#4961) 2025-06-30 14:12:52 -04:00
dependabot[bot]
393ba600c2 Build(deps): Bump sphinx-sitemap from 2.6.0 to 2.7.2 in /docs/sphinx (#4985)
Bumps [sphinx-sitemap](https://github.com/jdillard/sphinx-sitemap) from 2.6.0 to 2.7.2.
- [Release notes](https://github.com/jdillard/sphinx-sitemap/releases)
- [Changelog](https://github.com/jdillard/sphinx-sitemap/blob/master/CHANGELOG.rst)
- [Commits](https://github.com/jdillard/sphinx-sitemap/compare/v2.6.0...v2.7.2)

---
updated-dependencies:
- dependency-name: sphinx-sitemap
  dependency-version: 2.7.2
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-06-30 09:33:28 -06:00
Daniel Su
c64c545b52 [Ex CI] hipBLASLt: build some archs on medium pool (#4986) 2025-06-30 11:32:35 -04:00
Daniel Su
76ee1d720f [Ex CI] rocAL: switch to medium pool (#4983) 2025-06-27 13:41:07 -04:00
Daniel Su
5adc040367 [Ex CI] migrate hipBLAS-common & hipBLASLt pipeline IDs (#4982) 2025-06-27 12:09:58 -04:00
Daniel Su
061da8f306 [Ex CI] enable almalinux8 and gfx1100 builds for hipBLASLt, rocBLAS, rocSOLVER (#4955) 2025-06-27 10:39:30 -04:00
Daniel Su
e26767bca6 [Ex CI] Tensile: add boost filesystem (#4980) 2025-06-27 10:38:31 -04:00
Daniel Su
7b6f1800d4 [Ex CI] fix miopen-get-ck for new artifact naming scheme (#4979) 2025-06-26 15:49:13 -04:00
Pratik Basyal
a6221937f2 KMD UMD support footnote update ROCm 640 (#4973) (#4976)
* KMD UMD support footnote update ROCm 640

* Historical footnote
2025-06-26 15:34:21 -04:00
Mirza Halilcevic
9b102061f4 Add pybind11 as a pip module requirement for azure. 2025-06-24 08:06:52 -05:00
65 changed files with 1056 additions and 425 deletions

View File

@@ -86,8 +86,7 @@ jobs:
value: $(Agent.BuildDirectory)/rocm
- name: HIP_INC_DIR
value: $(Agent.BuildDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -33,8 +33,9 @@ parameters:
type: object
default:
- cmake
- libmsgpack-dev
- libboost-filesystem-dev
- libboost-program-options-dev
- libmsgpack-dev
- name: pipModules
type: object
default:

View File

@@ -39,4 +39,6 @@ jobs:
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
inputs:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml

View File

@@ -51,15 +51,15 @@ parameters:
buildJobs:
- { os: ubuntu2204, packageManager: apt }
- { os: almalinux8, packageManager: dnf }
# - name: downstreamComponentMatrix
# type: object
# default:
# - hipBLASLt:
# name: hipBLASLt
# sparseCheckoutDir: projects/hipblaslt
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - hipBLAS_common_build
- name: downstreamComponentMatrix
type: object
default:
- hipBLASLt:
name: hipBLASLt
sparseCheckoutDir: projects/hipblaslt
skipUnifiedBuild: 'false'
buildDependsOn:
- hipBLAS_common_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
@@ -122,14 +122,14 @@ jobs:
# extraEnvVars:
# - ROCM_PATH:::/home/user/workspace/rocm
# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}
- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}

View File

@@ -77,28 +77,28 @@ parameters:
type: object
default:
buildJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
# - { os: almalinux8, packageManager: dnf, target: gfx942 }
# - { os: almalinux8, packageManager: dnf, target: gfx90a }
# - { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
# - { os: almalinux8, packageManager: dnf, target: gfx1030 }
- { pool: rocm-ci_ultra_build_pool, os: ubuntu2204, packageManager: apt, target: gfx942 }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx90a }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1201 }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1030 }
- { pool: rocm-ci_ultra_build_pool, os: almalinux8, packageManager: dnf, target: gfx942 }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx90a }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1201 }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1100 }
- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
# - name: downstreamComponentMatrix
# type: object
# default:
# - rocBLAS:
# name: rocBLAS
# sparseCheckoutDir: projects/rocblas
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - hipBLASLt_build
- name: downstreamComponentMatrix
type: object
default:
- rocBLAS:
name: rocBLAS
sparseCheckoutDir: projects/rocblas
skipUnifiedBuild: 'false'
buildDependsOn:
- hipBLASLt_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
@@ -121,7 +121,7 @@ jobs:
value: $(Agent.BuildDirectory)/rocm
- name: DAY_STRING
value: $[format('{0:ddMMyyyy}', pipeline.startTime)]
pool: ${{ variables.ULTRA_BUILD_POOL }}
pool: ${{ job.pool }}
${{ if eq(job.os, 'almalinux8') }}:
container:
image: rocmexternalcicd.azurecr.io/manylinux228:latest
@@ -140,6 +140,10 @@ jobs:
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
@@ -156,18 +160,15 @@ jobs:
script: |
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
# hipBLASLt has a script for gtest and lapack
# https://github.com/ROCm/hipBLASLt/blob/develop/deps/CMakeLists.txt
# $(Agent.BuildDirectory)/deps is a temporary folder for the build process
# $(Agent.BuildDirectory)/s/deps is part of the hipBLASLt repo
- task: Bash@3
displayName: Build and install external dependencies
displayName: Build and install LAPACK
inputs:
targetType: inline
script: |
mkdir -p $(Agent.BuildDirectory)/deps
cd $(Agent.BuildDirectory)/deps
cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
mkdir -p $(Agent.BuildDirectory)/temp-deps
cd $(Agent.BuildDirectory)/temp-deps
# position-independent LAPACK is required for almalinux8 builds
cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
make
sudo make install
- script: |
@@ -187,7 +188,7 @@ jobs:
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
@@ -244,6 +245,7 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
@@ -280,14 +282,14 @@ jobs:
environment: test
gpuTarget: ${{ job.target }}
# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}
- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}
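
A note on the pool change in this hunk: each buildJobs entry now names its own agent pool and the job picks it up through pool: ${{ job.pool }}, instead of every build landing on the single ULTRA_BUILD_POOL. A minimal sketch of the pattern; the pool names below are copied from the matrix above, everything else is illustrative:

parameters:
- name: jobMatrix
  type: object
  default:
    buildJobs:
    - { pool: rocm-ci_ultra_build_pool, os: ubuntu2204, target: gfx942 }
    - { pool: rocm-ci_medium_build_pool, os: ubuntu2204, target: gfx90a }

jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: example_build_${{ job.os }}_${{ job.target }}
    # resolved per matrix entry rather than from one hard-coded pool variable
    pool: ${{ job.pool }}
    steps:
    - script: echo "building ${{ job.target }} on ${{ job.pool }}"
      displayName: Show pool selection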

View File

@@ -156,6 +156,7 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -70,8 +70,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -1,10 +1,29 @@
parameters:
- name: componentName
type: string
default: hipSPARSELt
- name: checkoutRepo
type: string
default: 'self'
- name: checkoutRef
type: string
default: ''
# monorepo related parameters
- name: sparseCheckoutDir
type: string
default: ''
- name: triggerDownstreamJobs
type: boolean
default: false
- name: downstreamAggregateNames
type: string
default: ''
- name: buildDependsOn
type: object
default: null
- name: unifiedBuild
type: boolean
default: false
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -64,7 +83,11 @@ parameters:
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
- job: hipSPARSELt_build_${{ job.target }}
- job: ${{ parameters.componentName }}_build_ubuntu2204_${{ job.target }}
${{ if parameters.buildDependsOn }}:
dependsOn:
- ${{ each build in parameters.buildDependsOn }}:
- ${{ build }}_ubuntu2204_${{ job.target }}
variables:
- group: common
- template: /.azuredevops/variables-global.yml
@@ -91,12 +114,15 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
gpuTarget: ${{ job.target }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
# Build and install gtest and lapack
# $(Pipeline.Workspace)/deps is a temporary folder for the build process
# $(Pipeline.Workspace)/s/deps is part of the hipSPARSELt repo
@@ -131,8 +157,10 @@ jobs:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
gpuTarget: ${{ job.target }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
parameters:
componentName: ${{ parameters.componentName }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -150,44 +178,49 @@ jobs:
- TENSILE_ROCM_OFFLOAD_BUNDLER_PATH:::/home/user/workspace/rocm/llvm/bin/clang-offload-bundler
installLatestCMake: true
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: hipSPARSELt_test_${{ job.target }}
dependsOn: hipSPARSELt_build_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: hipSPARSELt
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './hipsparselt-test'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_ubuntu2204_${{ job.target }}
timeoutInMinutes: 120
dependsOn: ${{ parameters.componentName }}_build_ubuntu2204_${{ job.target }}
condition:
and(succeeded(),
eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
eq(${{ parameters.aggregatePipeline }}, False)
)
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool: ${{ job.target }}_test_pool
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: ${{ job.target }}
${{ if parameters.triggerDownstreamJobs }}:
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin'
testExecutable: './hipsparselt-test'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes --gtest_filter=*pre_checkin*'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
pipModules: ${{ parameters.pipModules }}
environment: test
gpuTarget: ${{ job.target }}
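
To make the renamed jobs concrete, this is roughly what the expressions above expand to for one assumed invocation (componentName hipSPARSELt, buildDependsOn containing rocBLAS_build, an ubuntu2204/gfx942 matrix entry, unifiedBuild false); the expansion is inferred from the template, not captured from a real run:

jobs:
- job: hipSPARSELt_build_ubuntu2204_gfx942
  dependsOn:
  - rocBLAS_build_ubuntu2204_gfx942
- job: hipSPARSELt_test_ubuntu2204_gfx942
  timeoutInMinutes: 120
  dependsOn: hipSPARSELt_build_ubuntu2204_gfx942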

View File

@@ -67,7 +67,6 @@ jobs:
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
skipLlvmSymlink: true
aggregatePipeline: ${{ parameters.aggregatePipeline }}
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml

View File

@@ -86,8 +86,7 @@ jobs:
variables:
- group: common
- template: /.azuredevops/variables-global.yml
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -73,8 +73,7 @@ jobs:
- template: /.azuredevops/variables-global.yml
- name: HIP_ROCCLR_HOME
value: $(Build.BinariesDirectory)/rocm
pool:
vmImage: ${{ variables.BASE_BUILD_POOL }}
pool: ${{ variables.MEDIUM_BUILD_POOL }}
workspace:
clean: all
steps:

View File

@@ -33,17 +33,15 @@ parameters:
type: object
default:
- cmake
- ninja-build
- python3-venv
- git
- libmsgpack-dev
- gfortran
- libopenblas-dev
- googletest
- libgtest-dev
- wget
- python3-pip
- libdrm-dev
- libmsgpack-dev
- libopenblas-dev
- ninja-build
- python3-pip
- python3-venv
- wget
- name: pipModules
type: object
default:
@@ -52,18 +50,17 @@ parameters:
- name: rocmDependencies
type: object
default:
- rocm-cmake
- llvm-project
- ROCR-Runtime
- clr
- rocminfo
- rocprofiler-register
- rocm_smi_lib
- rocm-core
- aomp
- aomp-extras
- clr
- hipBLAS-common
- hipBLASLt
- llvm-project
- rocm-cmake
- rocm-core
- rocm_smi_lib
- rocminfo
- rocprofiler-register
- ROCR-Runtime
- roctracer
- name: rocmTestDependencies
type: object
@@ -86,32 +83,40 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
# - { os: almalinux8, packageManager: dnf, target: gfx942 }
# - { os: almalinux8, packageManager: dnf, target: gfx90a }
# - { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
# - { os: almalinux8, packageManager: dnf, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
# - name: downstreamComponentMatrix
# type: object
# default:
# # rocSOLVER depends on both rocBLAS and rocPRIM
# # for a unified build, rocBLAS will be the one to call rocSOLVER
# - rocSOLVER:
# name: rocSOLVER
# sparseCheckoutDir: projects/rocsolver
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - rocBLAS_build
# unifiedBuild:
# downstreamAggregateNames: rocBLAS+rocPRIM
# buildDependsOn:
# - rocBLAS_build
# - rocPRIM_build
- name: downstreamComponentMatrix
type: object
default:
# technically hipSPARSELt is a downstream component of hipSPARSE
# since hipSPARSE is not yet enabled, we will trigger it from rocBLAS in the interim
- hipSPARSELt:
name: hipSPARSELt
sparseCheckoutDir: projects/hipsparselt
skipUnifiedBuild: 'false'
buildDependsOn:
- rocBLAS_build
# rocSOLVER depends on both rocBLAS and rocPRIM
# for a unified build, rocBLAS will be the one to call rocSOLVER
# - rocSOLVER:
# name: rocSOLVER
# sparseCheckoutDir: projects/rocsolver
# skipUnifiedBuild: 'false'
# buildDependsOn:
# - rocBLAS_build
# unifiedBuild:
# downstreamAggregateNames: rocBLAS+rocPRIM
# buildDependsOn:
# - rocBLAS_build
# - rocPRIM_build
jobs:
- ${{ each job in parameters.jobMatrix.buildJobs }}:
@@ -151,6 +156,12 @@ jobs:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aocl.yml
parameters:
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
@@ -164,21 +175,12 @@ jobs:
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_TOOLCHAIN_FILE=toolchain-linux.cmake
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/bin/amdclang
-DGPU_TARGETS=${{ job.target }}
-DTensile_CODE_OBJECT_VERSION=default
-DTensile_LOGIC=asm_full
-DTensile_SEPARATE_ARCHITECTURES=ON
-DTensile_LAZY_LIBRARY_LOADING=ON
-DTensile_LIBRARY_FORMAT=msgpack
-DBUILD_CLIENTS_TESTS=ON
-DBUILD_CLIENTS_BENCHMARKS=OFF
-DBUILD_CLIENTS_SAMPLES=OFF
-DROCM_PATH=$(Agent.BuildDirectory)/rocm
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
parameters:
@@ -208,6 +210,7 @@ jobs:
- ${{ if eq(parameters.unifiedBuild, False) }}:
- ${{ each job in parameters.jobMatrix.testJobs }}:
- job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
timeoutInMinutes: 120
dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
condition:
and(succeeded(),
@@ -222,6 +225,7 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
@@ -258,18 +262,18 @@ jobs:
environment: test
gpuTarget: ${{ job.target }}
# - ${{ if parameters.triggerDownstreamJobs }}:
# - ${{ each component in parameters.downstreamComponentMatrix }}:
# - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
# - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
# parameters:
# checkoutRepo: ${{ parameters.checkoutRepo }}
# sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
# triggerDownstreamJobs: true
# unifiedBuild: ${{ parameters.unifiedBuild }}
# ${{ if parameters.unifiedBuild }}:
# buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
# ${{ else }}:
# buildDependsOn: ${{ component.buildDependsOn }}
# downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
- ${{ if parameters.triggerDownstreamJobs }}:
- ${{ each component in parameters.downstreamComponentMatrix }}:
- ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
- template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
triggerDownstreamJobs: true
unifiedBuild: ${{ parameters.unifiedBuild }}
${{ if parameters.unifiedBuild }}:
buildDependsOn: ${{ component.unifiedBuild.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ component.unifiedBuild.downstreamAggregateNames }}
${{ else }}:
buildDependsOn: ${{ component.buildDependsOn }}
downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
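
Read together with the hipBLAS-common and hipBLASLt hunks earlier in this compare, the interim downstream chain being wired up appears to be hipBLAS-common -> hipBLASLt -> rocBLAS -> hipSPARSELt, each parent listing its child in downstreamComponentMatrix. When triggerDownstreamJobs is true, the each-loop above expands the hipSPARSELt entry to roughly the following template call (values inferred from this diff):

- template: /.azuredevops/components/hipSPARSELt.yml@pipelines_repo
  parameters:
    checkoutRepo: self                      # passed through from the parent
    sparseCheckoutDir: projects/hipsparselt
    buildDependsOn:
    - rocBLAS_build
    # downstreamAggregateNames accumulates the parent component names along the chain
    triggerDownstreamJobs: true
    unifiedBuild: false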

View File

@@ -166,6 +166,7 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -27,6 +27,7 @@ parameters:
- numpy
- tomli
- scipy
- pybind11
- name: rocmDependencies
type: object
default:

View File

@@ -210,7 +210,7 @@ jobs:
parameters:
componentName: ${{ parameters.componentName }}
testDir: '$(Agent.BuildDirectory)/rocm/bin/rocprim'
extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }}'
extraTestParameters: '-I ${{ job.shard }},,${{ job.shardCount }} -E device_merge_inplace'
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:

View File

@@ -36,6 +36,7 @@ parameters:
- clr
- llvm-project
- rocDecode
- rocJPEG
- rocm-cmake
- rocm-core
- rocminfo
@@ -192,9 +193,9 @@ jobs:
inputs:
itemPattern: '**/*.whl'
targetPath: $(Agent.BuildDirectory)
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
parameters:
checkoutRepo: ${{ parameters.checkoutRepo }}
gpuTarget: ${{ job.target }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
@@ -221,25 +222,17 @@ jobs:
- task: CMake@1
displayName: 'rocPyDecode Test CMake Flags'
inputs:
workingDirectory: $(Agent.BuildDirectory)/rocm/share/rocpydecode/tests
cmakeArgs: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(PYTHON_USER_SITE)/pybind11;$(PYTHON_DIST_PACKAGES)/pybind11;$(PYBIND11_PATH)
-DCMAKE_BUILD_TYPE=Release
-DGPU_TARGETS=${{ job.target }}
..
.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
parameters:
componentName: rocPyDecode
testDir: $(Build.SourcesDirectory)/build
# sudo required for pip install but screws up permissions for next pipeline run
- task: Bash@3
displayName: Clean up test environment
condition: always()
inputs:
targetType: inline
script: |
pip uninstall -y rocPyDecode
pip uninstall -y hip-python
testDir: $(Agent.BuildDirectory)/rocm/share/rocpydecode/tests
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}

View File

@@ -33,13 +33,11 @@ parameters:
type: object
default:
- cmake
- ninja-build
- libsuitesparse-dev
- gfortran
- libfmt-dev
- git
- googletest
- libgtest-dev
- libfmt-dev
- libsuitesparse-dev
- ninja-build
- python3-pip
- name: rocmDependencies
type: object
@@ -75,13 +73,13 @@ parameters:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
- { os: ubuntu2204, packageManager: apt, target: gfx1201 }
# - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1100 }
- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
# - { os: almalinux8, packageManager: dnf, target: gfx942 }
# - { os: almalinux8, packageManager: dnf, target: gfx90a }
# - { os: almalinux8, packageManager: dnf, target: gfx1201 }
# - { os: almalinux8, packageManager: dnf, target: gfx1100 }
# - { os: almalinux8, packageManager: dnf, target: gfx1030 }
- { os: almalinux8, packageManager: dnf, target: gfx942 }
- { os: almalinux8, packageManager: dnf, target: gfx90a }
- { os: almalinux8, packageManager: dnf, target: gfx1201 }
- { os: almalinux8, packageManager: dnf, target: gfx1100 }
- { os: almalinux8, packageManager: dnf, target: gfx1030 }
testJobs:
- { os: ubuntu2204, packageManager: apt, target: gfx942 }
- { os: ubuntu2204, packageManager: apt, target: gfx90a }
@@ -119,6 +117,10 @@ jobs:
targetType: inline
script: git clone --depth 1 --branch v3.9.1 https://github.com/Reference-LAPACK/lapack
workingDirectory: '$(Build.SourcesDirectory)'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
parameters:
dependencyList:
- gtest
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
parameters:
checkoutRef: ${{ parameters.checkoutRef }}
@@ -134,6 +136,7 @@ jobs:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_Fortran_FLAGS=-fno-optimize-sibling-calls
-DBUILD_TESTING=OFF
-DCBLAS=ON
@@ -146,7 +149,7 @@ jobs:
parameters:
os: ${{ job.os }}
extraBuildFlags: >-
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install
-DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Pipeline.Workspace)/deps-install;$(Agent.BuildDirectory)/vendor
-DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
-DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-DAMDGPU_TARGETS=${{ job.target }}
@@ -191,6 +194,7 @@ jobs:
workspace:
clean: all
steps:
- checkout: none
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
parameters:
aptPackages: ${{ parameters.aptPackages }}
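
On the new -DCMAKE_POSITION_INDEPENDENT_CODE=ON flag: the v3.9.1 LAPACK cloned above is built as a static library, and PIC objects are what let that static archive be linked into the component's shared libraries later (the hipBLASLt hunk earlier in this compare notes the same requirement for almalinux8). A rough standalone equivalent of the configure step, with an illustrative install prefix taken from the CMAKE_PREFIX_PATH above:

- task: Bash@3
  displayName: Configure and build position-independent LAPACK (sketch)
  inputs:
    targetType: inline
    workingDirectory: $(Build.SourcesDirectory)/lapack
    script: |
      mkdir -p build
      cd build
      # compile every object as PIC so the static liblapack.a can be absorbed into shared libraries
      cmake .. \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
        -DCMAKE_Fortran_FLAGS=-fno-optimize-sibling-calls \
        -DBUILD_TESTING=OFF \
        -DCBLAS=ON \
        -DCMAKE_INSTALL_PREFIX=$(Pipeline.Workspace)/deps-install
      make install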

View File

@@ -67,7 +67,6 @@ jobs:
checkoutRef: ${{ parameters.checkoutRef }}
dependencyList: ${{ parameters.rocmDependencies }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
skipLlvmSymlink: true
os: ${{ job.os }}
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
parameters:

View File

@@ -407,7 +407,6 @@ jobs:
dependencyList: ${{ parameters.rocmTestDependencies }}
gpuTarget: $(JOB_GPU_TARGET)
dependencySource: staging
skipLlvmSymlink: true
# get sources to run test scripts
- task: Bash@3
displayName: git clone upstream pytorch

View File

@@ -119,7 +119,6 @@ jobs:
dependencyList: ${{ parameters.rocmDependencies }}
os: ${{ job.os }}
gpuTarget: ${{ job.target }}
skipLibraryLinking: true
- script: df -h
displayName: System disk space after ROCm
- script: du -sh $(Agent.BuildDirectory)/rocm

View File

@@ -12,6 +12,9 @@ parameters:
- name: fileFilter
type: string
default: ''
- name: extractAndDeleteFiles
type: boolean
default: true
# set to true if doing full build of ROCm stack
# and dependencies are pulled from same pipeline
- name: aggregatePipeline
@@ -37,16 +40,17 @@ steps:
buildVersionToDownload: latest # aomp trigger lives in ROCm/ROCm, so cannot use ROCm/aomp branch names
${{ else }}:
buildVersionToDownload: latestFromBranch
- task: ExtractFiles@1
displayName: Extract ${{ parameters.componentName }}
inputs:
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
destinationFolder: '$(Agent.BuildDirectory)/rocm'
cleanDestinationFolder: false
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: Cleanup Compressed ${{ parameters.componentName }}
inputs:
SourceFolder: '$(Pipeline.Workspace)/d'
Contents: '**/*.tar.gz'
RemoveDotFiles: true
- ${{ if eq(parameters.extractAndDeleteFiles, true) }}:
- task: ExtractFiles@1
displayName: Extract ${{ parameters.componentName }}
inputs:
archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
destinationFolder: '$(Agent.BuildDirectory)/rocm'
cleanDestinationFolder: false
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: Clean up Compressed ${{ parameters.componentName }}
inputs:
SourceFolder: '$(Pipeline.Workspace)/d'
Contents: '**/*.tar.gz'
RemoveDotFiles: true

View File

@@ -15,8 +15,8 @@ steps:
URL_BEGIN="https://artprodcus3.artifacts.visualstudio.com/"
URL_MIDDLE="/_apis/artifact/"
URL_END="/content?format=file&subPath=%2F"
FORMATTED_JOB_NAME=$(echo $(Agent.JobName) | sed 's/ /./g; s/[-_]//g')
ARTIFACT_STRING="pipelineartifact://ROCm-CI/projectId/$(DOWNLOAD_PROJECT_ID)/buildId/$(Build.BuildId)/artifactName/${FORMATTED_JOB_NAME}"
ARTIFACT_NAME="$(Agent.JobName)_$(System.JobAttempt)"
ARTIFACT_STRING="pipelineartifact://ROCm-CI/projectId/$(DOWNLOAD_PROJECT_ID)/buildId/$(Build.BuildId)/artifactName/${ARTIFACT_NAME}"
ENCODED_STRING=$(echo -n "${ARTIFACT_STRING}" | base64 -w 0)
PADDING_COUNT=$(echo -n "${ENCODED_STRING}" | awk -F= '{print NF-1}')
if [ "$PADDING_COUNT" -gt 0 ]; then
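
A self-contained sketch of what the renamed-artifact handling above computes; the project ID, build ID, and job name are placeholders, and the padding handling after the visible if is not part of this excerpt, so only the encoding itself is shown:

- task: Bash@3
  displayName: Encode artifact download string (sketch)
  inputs:
    targetType: inline
    script: |
      # the artifact name is now just the job name plus the attempt number,
      # so the old sed normalization of the job name is no longer needed
      ARTIFACT_NAME="hipSPARSELt_build_ubuntu2204_gfx942_1"
      ARTIFACT_STRING="pipelineartifact://ROCm-CI/projectId/00000000/buildId/12345/artifactName/${ARTIFACT_NAME}"
      ENCODED_STRING=$(echo -n "${ARTIFACT_STRING}" | base64 -w 0)
      # '=' padding has to be accounted for before the string can be embedded in a URL
      PADDING_COUNT=$(echo -n "${ENCODED_STRING}" | awk -F= '{print NF-1}')
      echo "encoded=${ENCODED_STRING} padding=${PADDING_COUNT}"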

View File

@@ -46,5 +46,5 @@ steps:
displayName: '${{ parameters.artifactName }} Publish'
retryCountOnTaskFailure: 3
inputs:
artifactName: ${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}_$(System.JobAttempt)
artifactName: $(Agent.JobName)_$(System.JobAttempt)
targetPath: '$(Build.ArtifactStagingDirectory)'

View File

@@ -1,10 +1,15 @@
parameters:
- name: os
type: string
default: ubuntu2204
- name: repositoryUrl
type: string
default: https://download.amd.com/developer/eula/aocl/aocl-4-2
- name: packageName
type: string
default: aocl-linux-gcc-4.2.0_1_amd64.deb
type: object
default:
ubuntu2204: aocl-linux-gcc-4.2.0_1_amd64.deb
almalinux8: aocl-linux-gcc-4.2.0-1.x86_64.rpm
steps:
- task: Bash@3
@@ -12,16 +17,19 @@ steps:
inputs:
targetType: inline
workingDirectory: $(Pipeline.Workspace)
script: wget -nv ${{ parameters.repositoryUrl }}/${{ parameters.packageName }}
script: wget -nv ${{ parameters.repositoryUrl }}/${{ parameters.packageName[parameters.os] }}
- task: Bash@3
displayName: Install AOCL
inputs:
targetType: inline
workingDirectory: $(Pipeline.Workspace)
script: sudo apt install -y ./${{ parameters.packageName }}
${{ if eq(parameters.os, 'ubuntu2204') }}:
script: sudo apt install -y ./${{ parameters.packageName[parameters.os] }}
${{ elseif eq(parameters.os, 'almalinux8') }}:
script: sudo dnf install -y ./${{ parameters.packageName[parameters.os] }}
- task: Bash@3
displayName: Clean up AOCL
inputs:
targetType: inline
workingDirectory: $(Pipeline.Workspace)
script: rm -f ${{ parameters.packageName }}
script: rm -f ${{ parameters.packageName[parameters.os] }}
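
The packageName parameter above changes from a plain string to an object keyed by OS, which a template expression can index directly. A minimal standalone sketch of the pattern; the package names are copied from the template and the echo step is only for illustration:

parameters:
- name: os
  type: string
  default: ubuntu2204
- name: packageName
  type: object
  default:
    ubuntu2204: aocl-linux-gcc-4.2.0_1_amd64.deb
    almalinux8: aocl-linux-gcc-4.2.0-1.x86_64.rpm

steps:
- script: echo "would download ${{ parameters.packageName[parameters.os] }}"
  displayName: Show package selected for ${{ parameters.os }}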

View File

@@ -52,6 +52,7 @@ parameters:
libexpat-dev: expat-devel
libffi-dev: libffi-devel
libfftw3-dev: fftw-devel
libfmt-dev: fmt-devel
libgmp-dev: gmp-devel
liblzma-dev: xz-devel
libmpfr-dev: mpfr-devel

View File

@@ -19,16 +19,6 @@ parameters:
- name: gpuTarget
type: string
default: ''
# set to true if you're calling this template file multiple files in same pipeline
# only leave last call false to optimize sequence
- name: skipLibraryLinking
type: boolean
default: false
# set to true if llvm-project is not downloaded in a particular call
# or if you just don't want the symlink
- name: skipLlvmSymlink
type: boolean
default: false
# set to true if dlopen calls for HIP libraries are causing failures
# because they do not follow shared library symlink convention
- name: setupHIPLibrarySymlinks
@@ -367,6 +357,7 @@ steps:
componentName: ${{ split(dependency, ':')[0] }}
pipelineId: ${{ parameters.componentVarList[split(dependency, ':')[0]].pipelineId }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
extractAndDeleteFiles: false
${{ if parameters.componentVarList[split(dependency, ':')[0]].hasGpuTarget }}:
fileFilter: "${{ split(dependency, ':')[1] }}*_${{ parameters.os }}_${{ parameters.gpuTarget }}"
# dependencySource = staging
@@ -405,6 +396,7 @@ steps:
componentName: ${{ dependency }}
pipelineId: ${{ parameters.componentVarList[dependency].pipelineId }}
aggregatePipeline: ${{ parameters.aggregatePipeline }}
extractAndDeleteFiles: false
${{ if parameters.componentVarList[dependency].hasGpuTarget }}:
fileFilter: ${{ parameters.os }}_${{ parameters.gpuTarget }}
${{ else }}:
@@ -430,8 +422,20 @@ steps:
# default = staging
${{ else }}:
branchName: ${{ parameters.componentVarList[dependency].stagingBranch }}
# Set link to redirect llvm folder
- ${{ if eq(parameters.skipLlvmSymlink, false) }}:
- task: ExtractFiles@1
displayName: Extract ROCm artifacts
inputs:
archiveFilePatterns: $(Pipeline.Workspace)/d/**/*.tar.gz
destinationFolder: $(Agent.BuildDirectory)/rocm
cleanDestinationFolder: false
overwriteExistingFiles: true
- task: DeleteFiles@1
displayName: Clean up ROCm artifacts
inputs:
SourceFolder: $(Pipeline.Workspace)/d
Contents: '**/*.tar.gz'
RemoveDotFiles: true
- ${{ if containsValue(parameters.dependencyList, 'llvm-project') }}:
- task: Bash@3
displayName: Symlink from rocm/llvm to rocm/lib/llvm
inputs:
@@ -439,6 +443,7 @@ steps:
script: |
sudo mkdir -p $(Agent.BuildDirectory)/rocm/lib
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm $(Agent.BuildDirectory)/rocm/lib/llvm
echo "Created symlink from rocm/llvm to rocm/lib/llvm"
- task: Bash@3
displayName: Symlink executables from rocm/llvm/bin to rocm/bin
inputs:
@@ -446,7 +451,14 @@ steps:
script: |
for file in amdclang amdclang++ amdclang-cl amdclang-cpp amdflang amdlld aompcc mygpu mycpu offload-arch; do
sudo ln -sr $(Agent.BuildDirectory)/rocm/llvm/bin/$file $(Agent.BuildDirectory)/rocm/bin/$file
echo "Created symlink from rocm/llvm/bin/$file to rocm/bin/$file"
done
- ${{ if containsValue(parameters.dependencyList, 'rocm-core') }}:
- task: Bash@3
displayName: Print rocm/.info/version
inputs:
targetType: inline
script: cat $(Agent.BuildDirectory)/rocm/.info/version
# dlopen calls within a ctest or pytest sequence runs into issues when shared library symlink convention is not followed
# the convention is as follows:
# unversioned .so is a symlink to major version .so
@@ -483,17 +495,16 @@ steps:
inputs:
targetType: inline
script: ls -la1R $(Agent.BuildDirectory)/rocm
- ${{ if eq(parameters.skipLibraryLinking, false) }}:
- task: Bash@3
displayName: 'Link ROCm shared libraries'
inputs:
targetType: inline
# OS ignores if the ROCm lib folder shows up more than once
script: |
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
sudo cat /etc/ld.so.conf.d/rocm-ci.conf
sudo ldconfig -v
ldconfig -p
- task: Bash@3
displayName: 'Link ROCm shared libraries'
inputs:
targetType: inline
# OS ignores if the ROCm lib folder shows up more than once
script: |
echo $(Agent.BuildDirectory)/rocm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
echo $(Agent.BuildDirectory)/rocm/llvm/lib64 | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
sudo cat /etc/ld.so.conf.d/rocm-ci.conf
sudo ldconfig -v
ldconfig -p
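
For reference, the shared-library symlink convention mentioned in the setupHIPLibrarySymlinks comment above looks like the following; the library name and version numbers are invented for illustration:

- task: Bash@3
  displayName: Shared library symlink convention (sketch)
  inputs:
    targetType: inline
    script: |
      # libexample.so.1.2.3  - the real file
      # libexample.so.1      - soname symlink, what dlopen and the runtime loader resolve
      # libexample.so        - unversioned symlink, what -lexample resolves at link time
      mkdir -p /tmp/symlink-demo
      cd /tmp/symlink-demo
      touch libexample.so.1.2.3
      ln -sf libexample.so.1.2.3 libexample.so.1
      ln -sf libexample.so.1 libexample.so
      ls -l libexample.so*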

View File

@@ -23,13 +23,14 @@ steps:
inputs:
targetType: inline
script: |
sudo apt-get install -y jq
${{ iif(or(eq(parameters.os, 'ubuntu2204'), eq(parameters.os, 'ubuntu2404')), 'sudo apt-get install -y jq', '') }}
# RESOURCES_REPOSITORIES is a runtime variable (not an env var!) that contains quotations and newlines
# So we need to save it to a file to properly preserve its formatting and contents
cat <<EOF > resources.repositories
$(RESOURCES_REPOSITORIES)
EOF
echo "Value of resources.repositories:"
cat resources.repositories
IS_TAG_BUILD=$(jq 'has("release_repo")' resources.repositories)
@@ -66,8 +67,6 @@ steps:
)
' resources.repositories)
manifest_json=$(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json
dependencies=()
for manifest_file in $(Pipeline.Workspace)/d/**/manifest_*.json; do
echo "Processing $manifest_file"
@@ -78,6 +77,10 @@ steps:
done
dependencies_json=$(printf '%s\n' "${dependencies[@]}" | jq -s '.')
manifest_filename="manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}"
echo "##vso[task.setvariable variable=manifest_filename]$manifest_filename"
manifest_json=$(Build.ArtifactStagingDirectory)/$manifest_filename.json
jq -n \
--argjson current "$current" \
--argjson dependencies "$dependencies_json" \
@@ -111,8 +114,14 @@ steps:
')
dependencies_rows=$(echo $dependencies_rows)
echo "##vso[task.setvariable variable=dependencies_rows;]$dependencies_rows"
cat $manifest_json
- task: Bash@3
displayName: Print manifest.json
condition: always()
continueOnError: true
inputs:
targetType: inline
script: |
cat $(Build.ArtifactStagingDirectory)/$(manifest_filename).json
- task: Bash@3
displayName: Create manifest.html
condition: always()
@@ -120,10 +129,10 @@ steps:
inputs:
targetType: inline
script: |
manifest_html=$(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
manifest_html="$(Build.ArtifactStagingDirectory)/$(manifest_filename).html"
cat <<EOF > $manifest_html
<html>
<h1>Manifest</h1>
<h1>$(manifest_filename)</h1>
<h2>Current</h2>
<table border="1">
<tr>
@@ -163,7 +172,7 @@ steps:
continueOnError: true
inputs:
tabName: Manifest
reportDir: $(Build.ArtifactStagingDirectory)/manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html
reportDir: $(Build.ArtifactStagingDirectory)/$(manifest_filename).html
- task: Bash@3
displayName: Save manifest artifact file name
condition: always()
@@ -172,5 +181,5 @@ steps:
workingDirectory: $(Pipeline.Workspace)
targetType: inline
script: |
echo "manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.html" >> pipelineArtifacts.txt
echo "manifest_${{ parameters.componentName }}_$(Build.BuildId)_$(Build.BuildNumber)_${{ parameters.os }}_${{ parameters.gpuTarget }}_${{ parameters.artifactName }}.json" >> pipelineArtifacts.txt
echo "$(manifest_filename).html" >> pipelineArtifacts.txt
echo "$(manifest_filename).json" >> pipelineArtifacts.txt
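
The manifest_filename handling above relies on an Azure Pipelines logging command to pass the computed name from one task to the later print, upload, and HTML steps. A minimal sketch of that mechanism with a placeholder value:

steps:
- task: Bash@3
  displayName: Compute manifest filename
  inputs:
    targetType: inline
    script: |
      manifest_filename="manifest_example_123_20250711.1_ubuntu2204_gfx942_drop"
      # makes $(manifest_filename) available to every subsequent task in the same job
      echo "##vso[task.setvariable variable=manifest_filename]$manifest_filename"
- task: Bash@3
  displayName: Use the filename in a later task
  inputs:
    targetType: inline
    script: echo "would publish $(manifest_filename).json and $(manifest_filename).html"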

View File

@@ -17,7 +17,6 @@ steps:
script: |
AZ_API="https://dev.azure.com/ROCm-CI/ROCm-CI/_apis"
GH_API="https://api.github.com/repos/ROCm"
ARTIFACT_NAME="composablekernelbuild${{ parameters.gpuTarget }}"
EXIT_CODE=0
# Try to find an Azure build for the specific CK commit called out in MIOpen's requirements.txt
@@ -39,8 +38,15 @@ steps:
echo "Found specific CK build ID: $CK_BUILD_ID"
fi
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?artifactName=$ARTIFACT_NAME&api-version=7.1"
ARTIFACT_URL=$(curl -s $AZURE_URL | jq '.resource.downloadUrl' | tr -d '"')
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?api-version=7.1"
ARTIFACT_URL=$(curl -s $AZURE_URL | \
jq --arg os "ubuntu2204" --arg gfx "${{ parameters.gpuTarget }}" '
.value
| map(select(.name | test($os) and test($gfx)))
| max_by(.name | capture("drop_(?<dropNumber>\\d+)").dropNumber | tonumber)
| .resource.downloadUrl
' | \
tr -d '"')
# If using the specific CK commit and it doesn't have any valid artifacts, use latest successful CK build instead
if { [[ -z "$ARTIFACT_URL" ]] || [[ "$ARTIFACT_URL" == "null" ]]; } && [[ $EXIT_CODE -eq 0 ]]; then
@@ -48,8 +54,15 @@ steps:
LATEST_BUILD_URL="$AZ_API/build/builds?definitions=$(COMPOSABLE_KERNEL_PIPELINE_ID)&statusFilter=completed&resultFilter=succeeded&\$top=1&api-version=7.1"
CK_BUILD_ID=$(curl -s $LATEST_BUILD_URL | jq '.value[0].id')
echo "Found latest CK build ID: $CK_BUILD_ID"
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?artifactName=$ARTIFACT_NAME&api-version=7.1"
ARTIFACT_URL=$(curl -s $AZURE_URL | jq '.resource.downloadUrl' | tr -d '"')
AZURE_URL="$AZ_API/build/builds/$CK_BUILD_ID/artifacts?api-version=7.1"
ARTIFACT_URL=$(curl -s $AZURE_URL | \
jq --arg os "ubuntu2204" --arg gfx "${{ parameters.gpuTarget }}" '
.value
| map(select(.name | test($os) and test($gfx)))
| max_by(.name | capture("drop_(?<dropNumber>\\d+)").dropNumber | tonumber)
| .resource.downloadUrl
' | \
tr -d '"')
EXIT_CODE=2
fi
@@ -57,8 +70,8 @@ steps:
wget --tries=5 --waitretry=10 --retry-connrefused -nv $ARTIFACT_URL -O $(System.ArtifactsDirectory)/ck.zip
unzip $(System.ArtifactsDirectory)/ck.zip -d $(System.ArtifactsDirectory)
mkdir -p $(Agent.BuildDirectory)/rocm
tar -zxvf $(System.ArtifactsDirectory)/$ARTIFACT_NAME/*.tar.gz -C $(Agent.BuildDirectory)/rocm
rm -r $(System.ArtifactsDirectory)/ck.zip $(System.ArtifactsDirectory)/$ARTIFACT_NAME
tar -zxvf $(System.ArtifactsDirectory)/composable_kernel*/*.tar.gz -C $(Agent.BuildDirectory)/rocm
rm -r $(System.ArtifactsDirectory)/ck.zip $(System.ArtifactsDirectory)/composable_kernel*
if [[ $EXIT_CODE -ne 0 ]]; then
BUILD_COMMIT=$(curl -s $AZ_API/build/builds/$CK_BUILD_ID | jq '.sourceVersion' | tr -d '"')
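
The reworked jq query above no longer requests a single hard-coded artifact name; it lists every artifact on the chosen CK build and picks the newest drop matching the OS and gfx target. A standalone sketch against an invented artifact listing (the names are made up to match the pattern the filter expects; with this input it prints https://example.invalid/drop_7):

- task: Bash@3
  displayName: Select CK artifact download URL (sketch)
  inputs:
    targetType: inline
    script: |
      # stand-in for the JSON returned by the Azure artifacts REST call above
      cat <<'EOF' > artifacts.json
      {"value": [
        {"name": "composable_kernel build ubuntu2204 gfx942 drop_3", "resource": {"downloadUrl": "https://example.invalid/drop_3"}},
        {"name": "composable_kernel build ubuntu2204 gfx942 drop_7", "resource": {"downloadUrl": "https://example.invalid/drop_7"}},
        {"name": "composable_kernel build almalinux8 gfx942 drop_9", "resource": {"downloadUrl": "https://example.invalid/drop_9"}}
      ]}
      EOF
      # keep artifacts whose name mentions both the OS and the gfx target, then take the highest drop number
      jq --arg os "ubuntu2204" --arg gfx "gfx942" '
        .value
        | map(select(.name | test($os) and test($gfx)))
        | max_by(.name | capture("drop_(?<dropNumber>\\d+)").dropNumber | tonumber)
        | .resource.downloadUrl
      ' artifacts.json | tr -d '"'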

View File

@@ -66,11 +66,11 @@ variables:
- name: HIP_TESTS_PIPELINE_ID
value: 233
- name: HIPBLAS_COMMON_PIPELINE_ID
value: 223
value: 300
- name: HIPBLAS_PIPELINE_ID
value: 87
- name: HIPBLASLT_PIPELINE_ID
value: 112
value: 301
- name: HIPCUB_PIPELINE_ID
value: 277
- name: HIPFFT_PIPELINE_ID
@@ -104,7 +104,7 @@ variables:
- name: ROCALUTION_PIPELINE_ID
value: 89
- name: ROCBLAS_PIPELINE_ID
value: 85
value: 302
- name: ROCDBGAPI_PIPELINE_ID
value: 135
- name: ROCDECODE_PIPELINE_ID

View File

@@ -654,4 +654,4 @@ There are a number of upcoming changes planned for HIP runtime API in an upcomin
that are not backward compatible with prior releases. Most of these changes increase
alignment between HIP and CUDA APIs or behavior. Some of the upcoming changes are to
clean up header files, remove namespace collision, and have a clear separation between
`hipRTC` and HIP runtime.
`hipRTC` and HIP runtime. For more information, see [HIP 7.0 Is Coming: What You Need to Know to Stay Ahead](https://rocm.blogs.amd.com/ecosystems-and-partners/transition-to-hip-7.0-blog/README.html).

View File

@@ -155,7 +155,7 @@ compatibility and system requirements.
.. [#mi300x] Oracle Linux and Azure Linux are supported only on AMD Instinct MI300X.
.. [#single-node] Debian 12 is supported only on AMD Instinct MI300X for single-node functionality.
.. [#mi300_620] **For ROCm 6.2.0** - MI300X (gfx942) is supported on listed operating systems *except* Ubuntu 22.04.5 [6.8 HWE] and Ubuntu 22.04.4 [6.5 HWE].
.. [#kfd_support] Starting from ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart (assuming hardware support is available in both). For earlier ROCm releases, the compatibility is provided for +/- 2 releases. These are the compatibility combinations that are currently supported.
.. [#kfd_support] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
.. [#ROCT-rocr] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
.. [#RDNA-OS] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.
@@ -235,6 +235,6 @@ Expand for full historical view of:
.. [#mi300_610-past-60] **For ROCm 6.1.0** - MI300A (gfx942) is supported on Ubuntu 22.04.4, RHEL 9.4, RHEL 9.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.4.
.. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
.. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
.. [#kfd_support-past-60] Starting from ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart (assuming hardware support is available in both). For earlier ROCm releases, the compatibility is provided for +/- 2 releases. These are the compatibility combinations that are currently supported.
.. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
.. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
.. [#RDNA-OS-past-60] Radeon AI PRO R9700, Radeon RX 9070 XT (gfx1201), Radeon RX 9060 XT (gfx1200), Radeon PRO W7700 (gfx1101), and Radeon RX 7800 XT (gfx1101) are supported only on Ubuntu 24.04.2, Ubuntu 22.04.5, RHEL 9.6, RHEL 9.5, and RHEL 9.4.

View File

@@ -8,7 +8,7 @@ MI300 and MI200 series performance counters and metrics
This document lists and describes the hardware performance counters and derived metrics available
for the AMD Instinct™ MI300 and MI200 GPU. You can also access this information using the
:doc:`ROCProfiler tool <rocprofiler:rocprofv1>`.
:doc:`ROCprofiler-SDK <rocprofiler-sdk:how-to/using-rocprofv3>`.
MI300 and MI200 series performance counters
===============================================================

View File

@@ -129,6 +129,7 @@ html_theme_options = {"link_main_doc": False}
redirects = {"reference/openmp/openmp": "../../about/compatibility/openmp.html"}
numfig = False
suppress_warnings = ["autosectionlabel.*"]
html_context = {
"project_path" : {project_path},

View File

@@ -0,0 +1,162 @@
vllm_benchmark:
unified_docker:
latest:
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c
rocm_version: 6.4.1
vllm_version: 0.9.0.1 (0.9.0.2.dev108+g71faa1880.rocm641)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: pyt_vllm_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: float16
- model: Llama 3.1 70B
mad_tag: pyt_vllm_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: float16
- model: Llama 3.1 405B
mad_tag: pyt_vllm_llama-3.1-405b
model_repo: meta-llama/Llama-3.1-405B-Instruct
url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
precision: float16
- model: Llama 2 7B
mad_tag: pyt_vllm_llama-2-7b
model_repo: meta-llama/Llama-2-7b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
precision: float16
- model: Llama 2 70B
mad_tag: pyt_vllm_llama-2-70b
model_repo: meta-llama/Llama-2-70b-chat-hf
url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
precision: float16
- model: Llama 3.1 8B FP8
mad_tag: pyt_vllm_llama-3.1-8b_fp8
model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 70B FP8
mad_tag: pyt_vllm_llama-3.1-70b_fp8
model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
precision: float8
- model: Llama 3.1 405B FP8
mad_tag: pyt_vllm_llama-3.1-405b_fp8
model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
precision: float8
- group: Mistral AI
tag: mistral
models:
- model: Mixtral MoE 8x7B
mad_tag: pyt_vllm_mixtral-8x7b
model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
precision: float16
- model: Mixtral MoE 8x22B
mad_tag: pyt_vllm_mixtral-8x22b
model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
precision: float16
- model: Mistral 7B
mad_tag: pyt_vllm_mistral-7b
model_repo: mistralai/Mistral-7B-Instruct-v0.3
url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
precision: float16
- model: Mixtral MoE 8x7B FP8
mad_tag: pyt_vllm_mixtral-8x7b_fp8
model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mixtral MoE 8x22B FP8
mad_tag: pyt_vllm_mixtral-8x22b_fp8
model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
precision: float8
- model: Mistral 7B FP8
mad_tag: pyt_vllm_mistral-7b_fp8
model_repo: amd/Mistral-7B-v0.1-FP8-KV
url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
precision: float8
- group: Qwen
tag: qwen
models:
- model: Qwen2 7B
mad_tag: pyt_vllm_qwen2-7b
model_repo: Qwen/Qwen2-7B-Instruct
url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
precision: float16
- model: Qwen2 72B
mad_tag: pyt_vllm_qwen2-72b
model_repo: Qwen/Qwen2-72B-Instruct
url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
precision: float16
- model: QwQ-32B
mad_tag: pyt_vllm_qwq-32b
model_repo: Qwen/QwQ-32B
url: https://huggingface.co/Qwen/QwQ-32B
precision: float16
tunableop: true
- group: Databricks DBRX
tag: dbrx
models:
- model: DBRX Instruct
mad_tag: pyt_vllm_dbrx-instruct
model_repo: databricks/dbrx-instruct
url: https://huggingface.co/databricks/dbrx-instruct
precision: float16
- model: DBRX Instruct FP8
mad_tag: pyt_vllm_dbrx_fp8
model_repo: amd/dbrx-instruct-FP8-KV
url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
precision: float8
- group: Google Gemma
tag: gemma
models:
- model: Gemma 2 27B
mad_tag: pyt_vllm_gemma-2-27b
model_repo: google/gemma-2-27b
url: https://huggingface.co/google/gemma-2-27b
precision: float16
- group: Cohere
tag: cohere
models:
- model: C4AI Command R+ 08-2024
mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
model_repo: CohereForAI/c4ai-command-r-plus-08-2024
url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
precision: float16
- model: C4AI Command R+ 08-2024 FP8
mad_tag: pyt_vllm_command-r-plus_fp8
model_repo: amd/c4ai-command-r-plus-FP8-KV
url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
precision: float8
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek MoE 16B
mad_tag: pyt_vllm_deepseek-moe-16b-chat
model_repo: deepseek-ai/deepseek-moe-16b-chat
url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
precision: float16
- group: Microsoft Phi
tag: phi
models:
- model: Phi-4
mad_tag: pyt_vllm_phi-4
model_repo: microsoft/phi-4
url: https://huggingface.co/microsoft/phi-4
- group: TII Falcon
tag: falcon
models:
- model: Falcon 180B
mad_tag: pyt_vllm_falcon-180b
model_repo: tiiuae/falcon-180B
url: https://huggingface.co/tiiuae/falcon-180B
precision: float16

View File

@@ -31,3 +31,11 @@ pytorch_inference_benchmark:
model_repo: genmo/mochi-1-preview
url: https://huggingface.co/genmo/mochi-1-preview
precision: float16
- group: Wan2.1
tag: wan
models:
- model: Wan2.1
mad_tag: pyt_wan2.1_inference
model_repo: Wan-AI/Wan2.1-T2V-14B
url: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
precision: bfloat16

View File

@@ -1,10 +1,11 @@
vllm_benchmark:
unified_docker:
latest:
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.0.1_20250605
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c
# TODO: update me
pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702
docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab
rocm_version: 6.4.1
vllm_version: 0.9.0.1 (0.9.0.2.dev108+g71faa1880.rocm641)
vllm_version: 0.9.1 (0.9.2.dev206+gb335519f2.rocm641)
pytorch_version: 2.7.0+gitf717b2a
hipblaslt_version: 0.15
model_groups:
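For reference, the updated ``pull_tag`` above maps to a plain ``docker pull`` command. The snippet below is only a sketch that reuses the tag verbatim from this YAML:
.. code-block:: shell
# pull the ROCm vLLM image referenced by the updated pull_tag
docker pull rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702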

View File

@@ -7,21 +7,21 @@ AMD Instinct MI300X performance guides
**************************************
The following performance guides provide essential guidance on the necessary
steps to properly :doc:`configure your system for AMD Instinct™ MI300X
accelerators <../system-optimization/mi300x>`. They include detailed
instructions on system settings and application :doc:`workload tuning
<../rocm-for-ai/inference-optimization/workload>` to help you
leverage the maximum capabilities of these accelerators and achieve superior
performance.
steps to properly `configure your system for AMD Instinct™ MI300X accelerators
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
They include detailed instructions on system settings and application
:doc:`workload tuning </how-to/rocm-for-ai/inference-optimization/workload>` to
help you leverage the maximum capabilities of these accelerators and achieve
superior performance.
* `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`__
covers essential system settings and system management practices to configure
your AMD Instinct MI300X system for performance.
* :doc:`../rocm-for-ai/inference-optimization/workload` covers steps to
* :doc:`/how-to/rocm-for-ai/inference-optimization/workload` covers steps to
optimize the performance of AMD Instinct MI300X series accelerators for HPC
and deep learning operations.
* :doc:`../rocm-for-ai/inference/vllm-benchmark` introduces a preconfigured
* :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm` introduces a preconfigured
environment for LLM inference, designed to help you test performance with
popular models on AMD Instinct MI300X series accelerators.

View File

@@ -24,5 +24,3 @@ training, fine-tuning, and inference. It leverages popular machine learning fram
- :doc:`Fine-tuning and inference <fine-tuning-and-inference>` using a
:doc:`single-accelerator <single-gpu-fine-tuning-and-inference>` or
:doc:`multi-accelerator <multi-gpu-fine-tuning-and-inference>` system.

View File

@@ -6,7 +6,7 @@
Use ROCm for AI
**************************
ROCm is an open-source software platform that enables high-performance computing and machine learning applications. It features the ability to accelerate training, fine-tuning, and inference for AI application development. With ROCm, you can access the full power of AMD GPUs, which can significantly improve the performance and efficiency of AI workloads.
You can use ROCm to perform distributed training, which enables you to train models across multiple GPUs or nodes simultaneously. Additionally, ROCm supports mixed-precision training, which can help reduce the memory and compute requirements of training workloads. For fine-tuning, ROCm provides access to various algorithms and optimization techniques. In terms of inference, ROCm provides several techniques that can help you optimize your models for deployment, such as quantization, GEMM tuning, and optimization with composable kernel.

View File

@@ -151,8 +151,8 @@ desired effect. Continuous iteration helps refine the performance gains and
address any new bottlenecks that may emerge.
ROCm provides a prebuilt optimized Docker image that has everything required to implement
the tips in this section. It includes ROCm, vLLM, PyTorch, and tuning files in the CSV
format. For more information, see :doc:`../inference/vllm-benchmark`.
the LLM inference tips in this section. It includes ROCm, PyTorch, and vLLM.
For more information, see :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
.. _mi300x-profiling-tools:
@@ -343,9 +343,10 @@ The following performance tips are not *specific* to vLLM -- they are general
but relevant in this context. You can tune the following vLLM parameters to
achieve optimal request latency and throughput performance.
* As described in :ref:`mi300x-env-vars`, the environment
variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM performance. Set it to
``export HIP_FORCE_DEV_KERNARG=1``.
* As described in `Environment variables (MI300X)
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#environment-variables>`_,
the environment variable ``HIP_FORCE_DEV_KERNARG`` can improve vLLM
performance. Set it to ``export HIP_FORCE_DEV_KERNARG=1``.
* Set the :ref:`RCCL environment variable <mi300x-rccl>` ``NCCL_MIN_NCHANNELS``
to ``112`` to increase the number of channels on MI300X to potentially improve
@@ -410,9 +411,9 @@ for additional performance tips. :ref:`fine-tuning-llms-vllm` describes vLLM
usage with ROCm.
ROCm provides a prebuilt optimized Docker image for validating the performance
of LLM inference with vLLM on the MI300X accelerator. The Docker image includes
ROCm, vLLM, PyTorch, and tuning files in the CSV format. For more information,
see :doc:`../inference/vllm-benchmark`.
of LLM inference with vLLM on MI300X series accelerators. The Docker image includes
ROCm, vLLM, and PyTorch. For more information, see
:doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
.. _mi300x-vllm-throughput-measurement:
@@ -1477,8 +1478,9 @@ following command: ``cat /proc/sys/kernel/numa_balancing`` and
checking whether the output is ``0``.
If the output is ``1``, you can disable NUMA auto-balancing by running the
following command: ``sudo sysctl kernel.numa_balancing=0``. For more
details, see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
following command: ``sudo sysctl kernel.numa_balancing=0``. For more details,
see `AMD Instinct MI300X system optimization
<https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#disable-numa-auto-balancing>`_.
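A compact sketch of that check-and-disable sequence, using only the commands quoted above:
.. code-block:: shell
# prints 1 if NUMA auto-balancing is enabled, 0 if disabled
cat /proc/sys/kernel/numa_balancing
# disable NUMA auto-balancing if the previous command printed 1
sudo sysctl kernel.numa_balancing=0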
.. _mi300x-rccl-disable-acs:

View File

@@ -59,7 +59,7 @@ MI300X accelerator with the prebuilt vLLM Docker image.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
@@ -322,22 +322,22 @@ Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/tuning-guides/mi300x/workload`.
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see :doc:`/how-to/system-optimization/mi300x`.
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Using ROCm for AI </how-to/rocm-for-ai/index>`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to optimize inference on LLMs, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/llm-fine-tuning-optimization/index>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for ROCm, see the
:doc:`Docker image support matrix <rocm-install-on-linux:reference/docker-image-support-matrix>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================

View File

@@ -82,7 +82,7 @@ MI300X accelerator with the prebuilt vLLM Docker image.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
@@ -392,25 +392,22 @@ Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/tuning-guides/mi300x/workload`.
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see :doc:`/how-to/system-optimization/mi300x`.
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Using ROCm for AI </how-to/rocm-for-ai/index>`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to optimize inference on LLMs, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/llm-fine-tuning-optimization/index>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for ROCm, see the
:doc:`Docker image support matrix <rocm-install-on-linux:reference/docker-image-support-matrix>`.
- To compare with the previous version of the ROCm vLLM Docker image for performance validation, refer to
`LLM inference performance validation on AMD Instinct MI300X (ROCm 6.2.0) <https://rocm.docs.amd.com/en/docs-6.2.0/how-to/performance-validation/mi300x/vllm-benchmark.html>`_.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================

View File

@@ -55,7 +55,7 @@ MI300X accelerator with the prebuilt vLLM Docker image.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
@@ -437,22 +437,22 @@ Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../inference-optimization/workload`.
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see :doc:`../../system-optimization/mi300x`.
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <hugging-face-models>`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../inference-optimization/index>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../fine-tuning/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================

View File

@@ -130,7 +130,7 @@ vLLM inference performance testing
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see :ref:`AMD Instinct MI300X system optimization <mi300x-disable-numa>`.
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
@@ -305,22 +305,22 @@ Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../inference-optimization/workload`.
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <hugging-face-models>`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../inference-optimization/index>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../fine-tuning/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================

View File

@@ -1,3 +1,5 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
@@ -319,22 +321,22 @@ Further reading
===============
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../inference-optimization/workload`.
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <hugging-face-models>`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../inference-optimization/index>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../fine-tuning/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================

View File

@@ -333,19 +333,19 @@ Further reading
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../../../inference-optimization/workload`.
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <../../hugging-face-models>`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../../../inference-optimization/index>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../../../fine-tuning/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================

View File

@@ -333,22 +333,23 @@ Further reading
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../../inference-optimization/workload`.
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <../hugging-face-models>`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../../inference-optimization/index>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../../fine-tuning/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -0,0 +1,353 @@
:orphan:
.. meta::
:description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
ROCm vLLM Docker image.
:keywords: model, MAD, automation, dashboarding, validate
**********************************
vLLM inference performance testing
**********************************
.. caution::
This is not the latest version of the ROCm vLLM inference performance documentation. See :doc:`../vllm` for the latest version.
.. _vllm-benchmark-unified-docker:
.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.0.1_20250605-benchmark-models.yaml
{% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
{% set model_groups = data.vllm_benchmark.model_groups %}
The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
a prebuilt, optimized environment for validating large language model (LLM)
inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
accelerators and includes the following components:
* `ROCm {{ unified_docker.rocm_version }} <https://github.com/ROCm/ROCm>`_
* `vLLM {{ unified_docker.vllm_version }} <https://docs.vllm.ai/en/latest>`_
* `PyTorch {{ unified_docker.pytorch_version }} <https://github.com/ROCm/pytorch.git>`_
* `hipBLASLt {{ unified_docker.hipblaslt_version }} <https://github.com/ROCm/hipBLASLt>`_
With this Docker image, you can quickly test the :ref:`expected
inference performance numbers <vllm-benchmark-performance-measurements>` for
MI300X series accelerators.
.. _vllm-benchmark-available-models:
Supported models
================
The following models are supported for inference performance benchmarking
with vLLM and ROCm. Some instructions, commands, and recommendations in this
documentation might vary by model -- select one to get started.
.. raw:: html
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
<div class="row mt-1">
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
{% set models = model_group.models %}
{% for model in models %}
{% if models|length % 3 == 0 %}
<div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% else %}
<div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
{% endif %}
{% endfor %}
{% endfor %}
</div>
</div>
</div>
.. _vllm-benchmark-vllm:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. note::
See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
Some models require access authorization prior to use via an external license agreement through a third party.
{% endfor %}
{% endfor %}
.. note::
vLLM is a toolkit and library for LLM inference and serving. AMD implements
high-performance custom kernels and modules in vLLM to enhance performance.
See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
more information.
.. _vllm-benchmark-performance-measurements:
Performance measurements
========================
To evaluate performance, the
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
page provides reference throughput and latency measurements for inferencing popular AI models.
.. important::
The performance data presented in
`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
only reflects the latest version of this inference benchmarking environment.
The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
Advanced features and known issues
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker>`__.
System validation
=================
Before running AI workloads, it's important to validate that your AMD hardware is configured
correctly and performing optimally.
To optimize performance, disable automatic NUMA balancing. Otherwise, the GPU
might hang until the periodic balancing is finalized. For more information,
see the :ref:`system validation steps <rocm-for-ai-system-optimization>`.
.. code-block:: shell
# disable automatic NUMA balancing
sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
# check if NUMA balancing is disabled (returns 0 if disabled)
cat /proc/sys/kernel/numa_balancing
0
To test for optimal performance, consult the recommended :ref:`System health benchmarks
<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
system's configuration.
Pull the Docker image
=====================
Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
Use the following command to pull the Docker image from Docker Hub.
.. code-block:: shell
docker pull {{ unified_docker.pull_tag }}
Benchmarking
============
Once the setup is complete, choose between two options to reproduce the
benchmark results:
.. _vllm-benchmark-mad:
{% for model_group in model_groups %}
{% for model in model_group.models %}
.. container:: model-doc {{model.mad_tag}}
.. tab-set::
.. tab-item:: MAD-integrated benchmarking
Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
directory and install the required packages on the host machine.
.. code-block:: shell
git clone https://github.com/ROCm/MAD
cd MAD
pip install -r requirements.txt
Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
using one GPU with the ``{{model.precision}}`` data type on the host machine.
.. code-block:: shell
export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
python3 tools/run_models.py --tags {{model.mad_tag}} --keep-model-dir --live-output --timeout 28800
MAD launches a Docker container with the name
``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
to collect latency and throughput performance data, you can also change the benchmarking
parameters. See the standalone benchmarking tab for more information.
{% if model.tunableop %}
.. note::
For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
TunableOp automatically explores different implementations and configurations of certain PyTorch
operators to find the fastest one for your hardware.
By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
(see
`<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To
enable it, edit the default run behavior in the ``models.json``
configuration before running inference -- update the model's run
``args`` by changing ``--tunableop off`` to ``--tunableop on`` (a minimal example of this edit follows the note).
Enabling TunableOp triggers a two-pass run -- a warm-up followed by the performance-collection run.
{% endif %}
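A minimal sketch of the ``models.json`` edit described in the note above, assuming the model's run ``args`` store the flag as a literal string:
.. code-block:: shell
# hypothetical one-liner: switch the TunableOp flag in the model's run args
sed -i 's/--tunableop off/--tunableop on/' models.json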
.. tab-item:: Standalone benchmarking
Run the vLLM benchmark tool independently by starting the
`Docker container <{{ unified_docker.docker_hub_url }}>`_
as shown in the following snippet.
.. code-block::
docker pull {{ unified_docker.pull_tag }}
docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env HUGGINGFACE_HUB_CACHE=/workspace --name test {{ unified_docker.pull_tag }}
In the Docker container, clone the ROCm MAD repository and navigate to the
benchmark scripts directory at ``~/MAD/scripts/vllm``.
.. code-block::
git clone https://github.com/ROCm/MAD
cd MAD/scripts/vllm
To start the benchmark, use the following command with the appropriate options.
.. code-block::
./vllm_benchmark_report.sh -s $test_option -m {{model.model_repo}} -g $num_gpu -d {{model.precision}}
.. list-table::
:header-rows: 1
:align: center
* - Name
- Options
- Description
* - ``$test_option``
- latency
- Measure decoding token latency
* -
- throughput
- Measure token generation throughput
* -
- all
- Measure both throughput and latency
* - ``$num_gpu``
- 1 or 8
- Number of GPUs
* - ``$datatype``
- ``float16`` or ``float8``
- Data type
.. note::
The input sequence length, output sequence length, and tensor parallel (TP) size are already configured. You don't need to specify them with this script.
.. note::
If you encounter the following error, provide a Hugging Face token that has been granted access to the gated model.
.. code-block::
OSError: You are trying to access a gated repo.
# pass your HF_TOKEN
export HF_TOKEN=$your_personal_hf_token
Here are some examples of running the benchmark with various options.
* Latency benchmark
Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
.. code-block::
./vllm_benchmark_report.sh -s latency -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
* Throughput benchmark
Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with ``{{model.precision}}`` precision.
.. code-block:: shell
./vllm_benchmark_report.sh -s throughput -m {{model.model_repo}} -g 8 -d {{model.precision}}
Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
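To skim a generated summary on the host, a simple sketch is shown below; the report path is illustrative, so substitute the file name printed for your model and precision:
.. code-block:: shell
# render a summary CSV as an aligned table in the terminal (path is illustrative)
column -s, -t < ./reports_float16_vllm_rocm6.4.1/summary/Llama-3.1-8B-Instruct_latency_report.csv | less -S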
.. raw:: html
<style>
mjx-container[jax="CHTML"][display="true"] {
text-align: left;
margin: 0;
}
</style>
.. note::
Throughput is calculated as:
- .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
- .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
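As a worked illustration of these definitions (not a measured result), 100 requests with 128 input tokens and 128 output tokens each, completed in 10 seconds, give :math:`throughput\_tot = 100 \times (128 + 128) / 10 = 2560` tokens/s and :math:`throughput\_gen = 100 \times 128 / 10 = 1280` tokens/s.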
{% endfor %}
{% endfor %}
Further reading
===============
- To learn more about the options for latency and throughput benchmark scripts,
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -18,58 +18,65 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
- PyTorch version
- Resources
* - 6.4.0
- 0.9.0.1
* - 6.4.1
- 0.9.1
- 2.7.0
-
* :doc:`Documentation <../vllm>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250702/images/sha256-45068a2079cb8df554ed777141bf0c67d6627c470a897256e60c9f262677faab>`_
* - 6.4.1
- 0.9.0.1
- 2.7.0
-
* :doc:`Documentation <vllm-0.9.0.1-20250605>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.0.1_20250605/images/sha256-f48beeb3d72663a93c77211eb45273d564451447c097e060befa713d565fa36c>`_
* - 6.3.1
- 0.8.5 (0.8.6.dev)
- 2.7.0
-
* :doc:`Documentation <vllm-0.8.5-20250521>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250521/images/sha256-38410c51af7208897cd8b737c9bdfc126e9bc8952d4aa6b88c85482f03092a11>`__
* - 6.3.1
- 0.8.5
- 2.7.0
-
* :doc:`Documentation <vllm-0.8.5-20250513>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_vllm_0.8.5_20250513/images/sha256-5c8b4436dd0464119d9df2b44c745fadf81512f18ffb2f4b5dc235c71ebe26b4>`__
* - 6.3.1
- 0.8.3
- 2.7.0
-
* :doc:`Documentation <vllm-0.8.3-20250415>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.8.3_20250415/images/sha256-ad9062dea3483d59dedb17c67f7c49f30eebd6eb37c3fac0a171fb19696cc845>`__
* - 6.3.1
- 0.7.3
- 2.7.0
-
* :doc:`Documentation <vllm-0.7.3-20250325>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_instinct_vllm0.7.3_20250325/images/sha256-25245924f61750b19be6dcd8e787e46088a496c1fe17ee9b9e397f3d84d35640>`__
* - 6.3.1
- 0.6.6
- 2.7.0
-
* :doc:`Documentation <vllm-0.6.6>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.3.1_mi300_ubuntu22.04_py3.12_vllm_0.6.6/images/sha256-9a12ef62bbbeb5a4c30a01f702c8e025061f575aa129f291a49fbd02d6b4d6c9>`__
* - 6.2.1
- 0.6.4
- 2.5.0
-
* :doc:`Documentation <vllm-0.6.4>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4/images/sha256-ccbb74cc9e7adecb8f7bdab9555f7ac6fc73adb580836c2a35ca96ff471890d8>`__
* - 6.2.0
- 0.4.3
- 2.4.0
-
* :doc:`Documentation <vllm-0.4.3>`
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`_
* `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50/images/sha256-9e4dd4788a794c3d346d7d0ba452ae5e92d39b8dfac438b2af8efdc7f15d22c0>`__

View File

@@ -32,10 +32,10 @@ PyTorch inference performance testing
<div id="vllm-benchmark-ud-params-picker" class="container-fluid">
<div class="row">
<div class="col-2 me-2 model-param-head">Model group</div>
<div class="col-2 me-2 model-param-head">Model</div>
<div class="row col-10">
{% for model_group in model_groups %}
<div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
<div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
{% endfor %}
</div>
</div>
@@ -103,7 +103,7 @@ PyTorch inference performance testing
The Chai-1 benchmark uses a specifically selected Docker image with ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference
.. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference
Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`_ from Docker Hub.

View File

@@ -112,7 +112,7 @@ vLLM inference performance testing
==================================
For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
see the developer's guide at `<https://github.com/ROCm/vllm/tree/7bb0618b1fe725b7d4fad9e525aa44da12c94a8b/docs/dev-docker>`__.
see the developer's guide at `<https://github.com/ROCm/vllm/tree/5486e7bc8523be0324ccd68f221959445b56cc2a/docs/dev-docker>`__.
System validation
=================
@@ -325,22 +325,22 @@ Further reading
see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
- To learn more about system settings and management practices to configure your system for
MI300X accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_
- For application performance optimization strategies for HPC and AI workloads,
including inference with vLLM, see :doc:`../../inference-optimization/workload`.
including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
- To learn how to run LLM models from Hugging Face or your own model, see
:doc:`Running models from Hugging Face <../hugging-face-models>`.
- To learn how to run community models from Hugging Face on AMD GPUs, see
:doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
- To learn how to optimize inference on LLMs, see
:doc:`Inference optimization <../../inference-optimization/index>`.
- To learn how to fine-tune LLMs and optimize inference, see
:doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
- To learn how to fine-tune LLMs, see
:doc:`Fine-tuning LLMs <../../fine-tuning/index>`.
- For a list of other ready-made Docker images for AI with ROCm, see
`AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
Previous versions
=================
See :doc:`previous-versions/vllm-history` to find documentation for previous releases
of the ``ROCm/vllm`` Docker image.

View File

@@ -14,14 +14,14 @@ Throughout the following topics, this section provides a comprehensive guide to
The AI Developer Hub contains `AMD ROCm tutorials <https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/>`_ for
training, fine-tuning, and inference. It leverages popular machine learning frameworks on AMD GPUs.
- :doc:`Installing ROCm and machine learning frameworks <install>`
- :doc:`Installing ROCm and machine learning frameworks <../install>`
- :doc:`Running models from Hugging Face <hugging-face-models>`
- :doc:`LLM inference frameworks <llm-inference-frameworks>`
- :doc:`vLLM inference performance testing <vllm-benchmark>`
- :doc:`vLLM inference performance testing <benchmark-docker/vllm>`
- :doc:`PyTorch inference performance testing <pytorch-inference-benchmark>`
- :doc:`PyTorch inference performance testing <benchmark-docker/pytorch-inference>`
- :doc:`Deploying your model <deploy-your-model>`

View File

@@ -141,7 +141,7 @@ Installing vLLM
ROCm provides a prebuilt optimized Docker image for validating the performance of LLM inference with vLLM
on the MI300X accelerator. The Docker image includes ROCm, vLLM, and PyTorch.
For more information, see :doc:`vllm-benchmark`.
For more information, see :doc:`/how-to/rocm-for-ai/inference/benchmark-docker/vllm`.
.. _fine-tuning-llms-tgi:

View File

@@ -28,7 +28,7 @@ ROCm supports multiple :doc:`installation methods <rocm-install-on-linux:install
* :doc:`Using your Linux distribution's package manager <rocm-install-on-linux:install/install-methods/package-manager-index>`
* :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/amdgpu-install>`
* :doc:`Using the AMDGPU installer <rocm-install-on-linux:install/install-methods/amdgpu-installer-index>`
* :ref:`Multi-version installation <rocm-install-on-linux:installation-types>`

View File

@@ -160,12 +160,14 @@ Download the Docker image
.. tab-set::
.. tab-item:: Ubuntu 24.04 + Python 3.12
:sync: py312
.. code-block:: shell
docker pull rocm/megatron-lm:v25.5_py312
.. tab-item:: Ubuntu 22.04 + Python 3.10
:sync: py310
.. code-block:: shell
@@ -173,9 +175,22 @@ Download the Docker image
2. Launch the Docker container.
.. code-block:: shell
.. tab-set::
docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 64G --name megatron_training_env rocm/megatron-lm:v25.5
.. tab-item:: Ubuntu 24.04 + Python 3.12
:sync: py312
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py312
.. tab-item:: Ubuntu 22.04 + Python 3.10
:sync: py310
.. code-block:: shell
docker run -it --device /dev/dri --device /dev/kfd --device /dev/infiniband --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v $HOME/.ssh:/root/.ssh --shm-size 128G --name megatron_training_env rocm/megatron-lm:v25.5_py310
3. Use these commands if you exit the ``megatron_training_env`` container and need to return to it.

View File

@@ -22,7 +22,7 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
- 6.3.4
- 2.8.0a0+gite2f9759
-
* `Documentation <../megatron-lm>`_
* :doc:`Documentation <../megatron-lm>`
* `Docker Hub <https://hub.docker.com/layers/rocm/megatron-lm/v25.5_py312/images/sha256-4506f18ba188d24189c6b1f95130b425f52c528a543bb3f420351824edceadc2>`_
* - v25.4

View File

@@ -102,7 +102,8 @@ the output is ``1``, run the following command to disable NUMA auto-balancing.
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
See :ref:`mi300x-disable-numa` for more information.
See :ref:`System validation and optimization <rocm-for-ai-system-optimization>`
for more information.
Hardware verification with ROCm
-------------------------------
@@ -118,7 +119,7 @@ Run the command:
rocm-smi --setperfdeterminism 1900
See :ref:`mi300x-hardware-verification-with-rocm` for more information.
See `Hardware verification with ROCm <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html#hardware-verification-with-rocm>`_ for more information.
RCCL Bandwidth Test
-------------------
@@ -171,7 +172,7 @@ Run on 8 GPUs (``-g 8``), scanning from 8 bytes to 10 GB:
./build/all_reduce_perf -b 8 -e 10G -f 2 -g 8
.. image:: ../../data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
.. image:: /data/how-to/rocm-for-ai/rccl-tests-8-gpu.png
:width: 800
Using one MPI process per GPU and ``-g 1`` for performance-oriented runs on both single-node and multi-node is
@@ -181,7 +182,7 @@ recommended. So, a run on 8 GPUs looks something like:
mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 10G -f 2 -g 1
.. image:: ../../data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
.. image:: /data/how-to/rocm-for-ai/rccl-tests-1-mpi-process-per-gpu.png
:width: 800
Running with one MPI process per GPU ensures a one-to-one mapping for CPUs and GPUs, which can be beneficial
@@ -271,7 +272,7 @@ end-of-document token, remove sentence splitting, and use the tokenizer type.
In this case, the automatically generated output files are named ``my-gpt2_text_document.bin`` and
``my-gpt2_text_document.idx``.
.. image:: ../../data/how-to/rocm-for-ai/prep-training-datasets-my-gpt2-text-document.png
.. image:: /data/how-to/rocm-for-ai/prep-training-datasets-my-gpt2-text-document.png
:width: 800
.. _amd-megatron-lm-environment-setup:
@@ -469,7 +470,7 @@ Benchmarking examples
See the sample output:
.. image:: ../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
.. image:: /data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
:width: 800
.. tab-item:: Multi node training
@@ -500,12 +501,12 @@ Benchmarking examples
Master node:
.. image:: ../../data/how-to/rocm-for-ai/2-node-training-master.png
.. image:: /data/how-to/rocm-for-ai/2-node-training-master.png
:width: 800
Worker node:
.. image:: ../../data/how-to/rocm-for-ai/2-node-training-worker.png
.. image:: /data/how-to/rocm-for-ai/2-node-training-worker.png
:width: 800
Previous versions

View File

@@ -111,7 +111,8 @@ the output is ``1``, run the following command to disable NUMA auto-balancing.
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
See :ref:`mi300x-disable-numa` for more information.
See :ref:`System validation and optimization <rocm-for-ai-system-optimization>`
for more information.
.. _mi300x-amd-megatron-lm-training:
@@ -489,7 +490,7 @@ Benchmarking examples
See the sample output:
.. image:: ../../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
.. image:: /data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
:width: 800
.. tab-item:: Multi-node training
@@ -520,12 +521,12 @@ Benchmarking examples
Master node:
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-master.png
.. image:: /data/how-to/rocm-for-ai/2-node-training-master.png
:width: 800
Worker node:
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-worker.png
.. image:: /data/how-to/rocm-for-ai/2-node-training-worker.png
:width: 800
Previous versions

View File

@@ -572,7 +572,7 @@ Benchmarking examples
See the sample output:
.. image:: ../../../../data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
.. image:: /data/how-to/rocm-for-ai/llama2-7b-training-log-sample.png
:width: 800
.. tab-item:: Multi-node training
@@ -603,12 +603,12 @@ Benchmarking examples
Master node:
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-master.png
.. image:: /data/how-to/rocm-for-ai/2-node-training-master.png
:width: 800
Worker node:
.. image:: ../../../../data/how-to/rocm-for-ai/2-node-training-worker.png
.. image:: /data/how-to/rocm-for-ai/2-node-training-worker.png
:width: 800
Previous versions

View File

@@ -80,7 +80,8 @@ the output is ``1``, run the following command to disable NUMA auto-balancing.
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
See :ref:`mi300x-disable-numa` for more information.
See :ref:`System validation and optimization <rocm-for-ai-system-optimization>`
for more information.
Environment setup
=================

View File

@@ -1,3 +1,5 @@
:orphan:
.. meta::
:description: How to train a model using PyTorch for ROCm.
:keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker

View File

@@ -76,14 +76,6 @@ Ubuntu versions.
single node workstations, multi and many-core nodes, clusters of nodes via
QMP, and classic vector computers.
* -
- `Grid <https://github.com/amd/InfinityHub-CI/tree/main/grid/>`_
- Grid is a library for lattice QCD calculations that employs a high-level data parallel
approach while using a number of techniques to target multiple types of parallelism.
The library currently supports MPI, OpenMP and short vector parallelism. The SIMD
instruction sets covered include SSE, AVX, AVX2, FMA4, IMCI and AVX512. Recent
releases expanded this support to include GPU offloading.
* -
- `MILC <https://github.com/amd/InfinityHub-CI/tree/main/milc/>`_
- The MILC Code is a set of research codes developed by MIMD Lattice Computation
@@ -237,12 +229,18 @@ Ubuntu versions.
of these applications.
* - Tools and libraries
- `ROCm with GPU-aware MPI container <https://github.com/amd/InfinityHub-CI/tree/main/base-gpu-mpi-rocm-docker>`_
- `AMD ROCm with OpenMPI container <https://github.com/amd/InfinityHub-CI/tree/main/base-gpu-mpi-rocm-docker>`_
- Base container for GPU-aware MPI with ROCm for HPC applications. This
project provides a boilerplate for building and running a Docker
container with ROCm supporting GPU-aware MPI implementations using
OpenMPI or UCX.
* -
- `AMD ROCm with MPICH container <https://github.com/amd/InfinityHub-CI/tree/main/base-mpich-rocm-docker>`_
- Base container for GPU-aware MPI with ROCm for HPC applications. This
project provides a boilerplate for building and running a Docker
container with ROCm supporting GPU-aware MPI implementations using MPICH.
* -
- `Kokkos <https://github.com/amd/InfinityHub-CI/tree/main/kokkos>`_
- Kokkos is a programming model in C++ for writing performance portable

View File

@@ -12,8 +12,7 @@ accelerators. They include detailed instructions on system settings and
application tuning suggestions to help you fully leverage the capabilities of
these accelerators, thereby achieving optimal performance.
* :doc:`../../rocm-for-ai/inference/vllm-benchmark`
* :doc:`../../rocm-for-ai/inference-optimization/workload`
* :doc:`/how-to/rocm-for-ai/inference-optimization/workload`
* `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_

View File

@@ -215,9 +215,9 @@ sphinx==8.1.3
# sphinx-copybutton
# sphinx-design
# sphinx-external-toc
# sphinx-last-updated-by-git
# sphinx-notfound-page
# sphinx-reredirects
# sphinx-sitemap
# sphinxcontrib-datatemplates
# sphinxcontrib-runcmd
sphinx-book-theme==1.1.4
@@ -228,11 +228,13 @@ sphinx-design==0.6.1
# via rocm-docs-core
sphinx-external-toc==1.0.1
# via rocm-docs-core
sphinx-last-updated-by-git==0.3.8
# via sphinx-sitemap
sphinx-notfound-page==1.1.0
# via rocm-docs-core
sphinx-reredirects==0.1.6
# via -r requirements.in
sphinx-sitemap==2.6.0
sphinx-sitemap==2.7.2
# via -r requirements.in
sphinxcontrib-applehelp==2.0.0
# via sphinx

View File

@@ -98,7 +98,7 @@ System Management
.. csv-table::
:header: "Component", "Description"
":doc:`AMD SMI <amdsmi:index>`", "C library for Linux that provides a user space interface for applications to monitor and control AMD devices"
":doc:`AMD SMI <amdsmi:index>`", "System management interface to control AMD GPU settings, monitor performance, and retrieve device and process information"
":doc:`ROCm Data Center Tool <rdc:index>`", "Simplifies administration and addresses key infrastructure challenges in AMD GPUs in cluster and data-center environments"
":doc:`rocminfo <rocminfo:index>`", "Reports system information"
":doc:`ROCm SMI <rocm_smi_lib:index>`", "C library for Linux that provides a user space interface for applications to monitor and control GPU applications"