From c34fddb26a296d5b95aa98ade938bf301b38f7d4 Mon Sep 17 00:00:00 2001
From: David Dixon <165835255+davidd-amd@users.noreply.github.com>
Date: Tue, 2 Sep 2025 13:28:19 -0600
Subject: [PATCH 01/53] Add boost deps (#5235)

---
 .azuredevops/components/hipBLASLt.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.azuredevops/components/hipBLASLt.yml b/.azuredevops/components/hipBLASLt.yml
index d8bfbd0bb..b2633e84d 100644
--- a/.azuredevops/components/hipBLASLt.yml
+++ b/.azuredevops/components/hipBLASLt.yml
@@ -35,6 +35,8 @@ parameters:
     - ccache
     - gfortran
     - git
+    - libboost-filesystem-dev
+    - libboost-program-options-dev
     - libdrm-dev
     - liblapack-dev
     - libmsgpack-dev

From b6647dfb2231e8991fb2ee5e80ae706b97080b16 Mon Sep 17 00:00:00 2001
From: David Dixon <165835255+davidd-amd@users.noreply.github.com>
Date: Wed, 3 Sep 2025 11:35:53 -0600
Subject: [PATCH 02/53] Add spdlog source builds (#5247)

---
 .azuredevops/dependencies/spdlog.yml | 64 ++++++++++++++++++++++++++++
 .azuredevops/tag-builds/spdlog.yml   | 16 +++++++
 2 files changed, 80 insertions(+)
 create mode 100644 .azuredevops/dependencies/spdlog.yml
 create mode 100644 .azuredevops/tag-builds/spdlog.yml

diff --git a/.azuredevops/dependencies/spdlog.yml b/.azuredevops/dependencies/spdlog.yml
new file mode 100644
index 000000000..74f997fb5
--- /dev/null
+++ b/.azuredevops/dependencies/spdlog.yml
@@ -0,0 +1,64 @@
+parameters:
+- name: checkoutRepo
+  type: string
+  default: 'self'
+- name: checkoutRef
+  type: string
+  default: ''
+- name: aptPackages
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+    - libfmt-dev
+
+- name: jobMatrix
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt, spdlogVersion: "v1.9.2"}
+      - { os: almalinux8, packageManager: dnf, spdlogVersion: "v1.5.0"}
+
+jobs:
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: spdlog_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - task: Bash@3
+      displayName: Clone spdlog ${{ job.spdlogVersion }}
+      inputs:
+        targetType: inline
+        script: git clone https://github.com/gabime/spdlog.git -b ${{ job.spdlogVersion }}
+        workingDirectory: $(Agent.BuildDirectory)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/spdlog/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/spdlog
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_BUILD_TYPE=Release
+          -DSPDLOG_USE_STD_FORMAT=OFF
+          -DSPDLOG_FMT_EXTERNAL_HO=ON
+          -DSPDLOG_INSTALL=ON
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
diff --git a/.azuredevops/tag-builds/spdlog.yml b/.azuredevops/tag-builds/spdlog.yml
new file mode 100644
index 000000000..0d8de151e
--- /dev/null
+++ b/.azuredevops/tag-builds/spdlog.yml
@@ -0,0 +1,16 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none
+pr: none
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/spdlog.yml

From df3ea802908d1815b6f53271a59cc176337feba0 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Wed, 20 Aug 2025 16:52:43 -0400
Subject: [PATCH 03/53] Enable Roctracer Monorepo

---
 .azuredevops/components/roctracer.yml | 130 ++++++++++++++++----------
 1 file changed, 80 insertions(+), 50 deletions(-)

diff --git a/.azuredevops/components/roctracer.yml b/.azuredevops/components/roctracer.yml
index d00c03ecc..503cd18bd 100644
--- a/.azuredevops/components/roctracer.yml
+++ b/.azuredevops/components/roctracer.yml
@@ -8,6 +8,22 @@ parameters:
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -65,6 +81,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
   - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}_${{ job.target }}
     variables:
     - group: common
     - template: /.azuredevops/variables-global.yml
@@ -87,6 +107,7 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
       parameters:
         checkoutRef: ${{ parameters.checkoutRef }}
@@ -94,6 +115,8 @@ jobs:
         gpuTarget: ${{ job.target }}
         aggregatePipeline: ${{ parameters.aggregatePipeline }}
         os: ${{ job.os }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
     # the linker flags will not affect ubuntu2204 builds as the paths do not exist
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
@@ -109,10 +132,13 @@ jobs:
           -GNinja
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
         os: ${{ job.os }}
         gpuTarget: ${{ job.target }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
         gpuTarget: ${{ job.target }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
@@ -123,53 +149,57 @@ jobs:
     #     gpuTarget: ${{ job.target }}
     #     registerROCmPackages: true
 
-- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        packageManager: ${{ job.packageManager }}
-        registerROCmPackages: true
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: roctracer
-        testExecutable: $(Agent.BuildDirectory)/rocm/share/roctracer/run_tests.sh
-        testParameters: ''
-        testDir: $(Agent.BuildDirectory)
-        testPublishResults: false
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        environment: test
-        gpuTarget: ${{ job.target }}
-        registerROCmPackages: true
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          packageManager: ${{ job.packageManager }}
+          registerROCmPackages: true
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
+          gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testExecutable: $(Agent.BuildDirectory)/rocm/share/roctracer/run_tests.sh
+          testParameters: ''
+          testDir: $(Agent.BuildDirectory)
+          testPublishResults: false
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+          registerROCmPackages: true

From 2628812fc42be2574d47a938be248d8352062121 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Thu, 21 Aug 2025 18:18:00 -0400
Subject: [PATCH 04/53] [Ex CI] Enable rocprofiler-compute monorepo

---
 .../components/rocprofiler-compute.yml        | 183 ++++++++++--------
 1 file changed, 107 insertions(+), 76 deletions(-)

diff --git a/.azuredevops/components/rocprofiler-compute.yml b/.azuredevops/components/rocprofiler-compute.yml
index ed83b277a..6f307747f 100644
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocPRIM
 - name: checkoutRepo
   type: string
   default: 'self'
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -77,7 +96,11 @@ parameters:
 
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocprofiler_compute_build_${{ job.target }}
+  - job: ${{ parameters.componentName }}_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}_${{ job.target }}
     variables:
     - group: common
     - template: /.azuredevops/variables-global.yml
@@ -94,15 +117,19 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
         extraBuildFlags: >-
           -GNinja
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
         gpuTarget: ${{ job.target }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         gpuTarget: ${{ job.target }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
     # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -111,78 +138,82 @@ jobs:
     #     pipModules: ${{ parameters.pipModules }}
     #     gpuTarget: ${{ job.target }}
 
-- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocprofiler_compute_test_${{ job.target }}
-    timeoutInMinutes: 120
-    dependsOn: rocprofiler_compute_build_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    - name: PYTHON_VERSION
-      value: 3.10
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: ${{ parameters.checkoutRepo }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-    - task: Bash@3
-      displayName: Add en_US.UTF-8 locale
-      inputs:
-        targetType: inline
-        script: |
-          sudo locale-gen en_US.UTF-8
-          sudo update-locale
-          locale -a
-    - task: Bash@3
-      displayName: Add ROCm binaries to PATH
-      inputs:
-        targetType: inline
-        script: |
-          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
-          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        extraBuildFlags: >-
-          -DCMAKE_HIP_ARCHITECTURES=${{ job.target }}
-          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-          -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
-          -DCMAKE_BUILD_TYPE=Release
-          -DENABLE_TESTS=ON
-          -DINSTALL_TESTS=ON
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocprofiler-compute
-        testDir: $(Build.BinariesDirectory)/libexec/rocprofiler-compute
-        testExecutable: ROCM_PATH=$(Agent.BuildDirectory)/rocm ctest
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: ${{ parameters.componentName }}_test_${{ job.target }}
+      timeoutInMinutes: 120
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      - name: PYTHON_VERSION
+        value: 3.10
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
+          gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - task: Bash@3
+        displayName: Add en_US.UTF-8 locale
+        inputs:
+          targetType: inline
+          script: |
+            sudo locale-gen en_US.UTF-8
+            sudo update-locale
+            locale -a
+      - task: Bash@3
+        displayName: Add ROCm binaries to PATH
+        inputs:
+          targetType: inline
+          script: |
+            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
+            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        parameters:
+          extraBuildFlags: >-
+            -DCMAKE_HIP_ARCHITECTURES=${{ job.target }}
+            -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
+            -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
+            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+            -DROCM_PATH=$(Agent.BuildDirectory)/rocm
+            -DCMAKE_BUILD_TYPE=Release
+            -DENABLE_TESTS=ON
+            -DINSTALL_TESTS=ON
+            -GNinja
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: $(Build.BinariesDirectory)/libexec/rocprofiler-compute
+          testExecutable: ROCM_PATH=$(Agent.BuildDirectory)/rocm ctest
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}

From ceabccad83b83ffc3a2a74eddec1138f2947dee8 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Mon, 25 Aug 2025 11:10:20 -0400
Subject: [PATCH 05/53] Fixed componentName

---
 .azuredevops/components/rocprofiler-compute.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/components/rocprofiler-compute.yml b/.azuredevops/components/rocprofiler-compute.yml
index 6f307747f..b80939806 100644
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -1,7 +1,7 @@
 parameters:
 - name: componentName
   type: string
-  default: rocPRIM
+  default: rocprofiler-compute
 - name: checkoutRepo
   type: string
   default: 'self'

From b0abc43c469ab14cb19be5f2dd87e61897432109 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Mon, 25 Aug 2025 11:22:18 -0400
Subject: [PATCH 06/53] Added sparseCheckout to testjob template

---
 .azuredevops/components/rocprofiler-compute.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.azuredevops/components/rocprofiler-compute.yml b/.azuredevops/components/rocprofiler-compute.yml
index b80939806..4ccb47c65 100644
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -166,6 +166,7 @@ jobs:
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
         parameters:
           checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
         parameters:
           preTargetFilter: ${{ parameters.componentName }}

From bff5c4a955542d97b9394a585684000f378706de Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Mon, 25 Aug 2025 11:27:05 -0400
Subject: [PATCH 07/53] Fixed sparseCheckoutDir

---
 .azuredevops/components/rocprofiler-compute.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/components/rocprofiler-compute.yml b/.azuredevops/components/rocprofiler-compute.yml
index 4ccb47c65..71cee6f30 100644
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -166,7 +166,7 @@ jobs:
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
         parameters:
           checkoutRepo: ${{ parameters.checkoutRepo }}
-          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
         parameters:
           preTargetFilter: ${{ parameters.componentName }}

From e68d9e9ce2a0e7848e6fb4b41e55af4edaef9211 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Mon, 25 Aug 2025 11:32:36 -0400
Subject: [PATCH 08/53] Update rocprofiler-compute.yml

---
 .azuredevops/components/rocprofiler-compute.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/components/rocprofiler-compute.yml b/.azuredevops/components/rocprofiler-compute.yml
index 71cee6f30..bb3282be0 100644
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -1,7 +1,7 @@
 parameters:
 - name: componentName
   type: string
-  default: rocprofiler-compute
+  default: rocprofiler_compute
 - name: checkoutRepo
   type: string
   default: 'self'

From c486c39b50fe27c41aebdcf014555457a3495cd4 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Mon, 25 Aug 2025 11:36:28 -0400
Subject: [PATCH 09/53] Update rocprofiler-compute.yml

Reverted Component name and updated job names
---
 .azuredevops/components/rocprofiler-compute.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.azuredevops/components/rocprofiler-compute.yml b/.azuredevops/components/rocprofiler-compute.yml
index bb3282be0..d1bbaf3f6 100644
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -1,7 +1,7 @@
 parameters:
 - name: componentName
   type: string
-  default: rocprofiler_compute
+  default: rocprofiler-compute
 - name: checkoutRepo
   type: string
   default: 'self'
@@ -96,7 +96,7 @@ parameters:
 
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: ${{ parameters.componentName }}_build_${{ job.target }}
+  - job: rocprofiler_compute_build_${{ job.target }}
     ${{ if parameters.buildDependsOn }}:
       dependsOn:
         - ${{ each build in parameters.buildDependsOn }}:
@@ -140,7 +140,7 @@ jobs:
 
 - ${{ if eq(parameters.unifiedBuild, False) }}:
   - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: ${{ parameters.componentName }}_test_${{ job.target }}
+    - job: rocprofiler_compute_test_${{ job.target }}
       timeoutInMinutes: 120
       dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
       condition:

From 07cb61f96942334f5a152cfdb252188dabbcf855 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Mon, 25 Aug 2025 11:38:53 -0400
Subject: [PATCH 10/53] Update testjob dependsOn

---
 .azuredevops/components/rocprofiler-compute.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/components/rocprofiler-compute.yml b/.azuredevops/components/rocprofiler-compute.yml
index d1bbaf3f6..d15414469 100644
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -142,7 +142,7 @@ jobs:
   - ${{ each job in parameters.jobMatrix.testJobs }}:
     - job: rocprofiler_compute_test_${{ job.target }}
       timeoutInMinutes: 120
-      dependsOn: ${{ parameters.componentName }}_build_${{ job.target }}
+      dependsOn: rocprofiler_compute_build_${{ job.target }}
       condition:
         and(succeeded(),
           eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),

From f1be2d291a2651805e4151642aac10af07a66cf4 Mon Sep 17 00:00:00 2001
From: David Dixon <165835255+davidd-amd@users.noreply.github.com>
Date: Wed, 3 Sep 2025 13:26:18 -0600
Subject: [PATCH 11/53] Add fmtlib version that works with spdlog (#5249)

---
 .azuredevops/dependencies/fmtlib.yml          | 67 +++++++++++++++++++
 .azuredevops/dependencies/spdlog.yml          | 17 +++--
 .azuredevops/tag-builds/fmtlib.yml            | 23 +++++++
 .azuredevops/tag-builds/spdlog.yml            |  9 ++-
 .../templates/steps/dependencies-vendor.yml   |  2 +
 5 files changed, 112 insertions(+), 6 deletions(-)
 create mode 100644 .azuredevops/dependencies/fmtlib.yml
 create mode 100644 .azuredevops/tag-builds/fmtlib.yml

diff --git a/.azuredevops/dependencies/fmtlib.yml b/.azuredevops/dependencies/fmtlib.yml
new file mode 100644
index 000000000..c1ee707c4
--- /dev/null
+++ b/.azuredevops/dependencies/fmtlib.yml
@@ -0,0 +1,67 @@
+parameters:
+- name: checkoutRepo
+  type: string
+  default: 'self'
+- name: checkoutRef
+  type: string
+  default: ''
+- name: fmtlibVersion
+  type: string
+  default: ''
+- name: aptPackages
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+    - libfmt-dev
+
+- name: jobMatrix
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
+
+jobs:
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: fmtlib_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - task: Bash@3
+      displayName: Clone fmtlib ${{ parameters.fmtlibVersion }}
+      inputs:
+        targetType: inline
+        script: git clone https://github.com/fmtlib/fmt.git -b ${{ parameters.fmtlibVersion }}
+        workingDirectory: $(Agent.BuildDirectory)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/fmt/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/fmt
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_BUILD_TYPE=Release
+          -DFMT_SYSTEM_HEADERS=ON
+          -DFMT_INSTALL=ON
+          -DFMT_TEST=OFF
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
diff --git a/.azuredevops/dependencies/spdlog.yml b/.azuredevops/dependencies/spdlog.yml
index 74f997fb5..f561f8a52 100644
--- a/.azuredevops/dependencies/spdlog.yml
+++ b/.azuredevops/dependencies/spdlog.yml
@@ -5,20 +5,22 @@ parameters:
 - name: checkoutRef
   type: string
   default: ''
+- name: spdlogVersion
+  type: string
+  default: ''
 - name: aptPackages
   type: object
   default:
     - cmake
     - git
     - ninja-build
-    - libfmt-dev
 
 - name: jobMatrix
   type: object
   default:
     buildJobs:
-      - { os: ubuntu2204, packageManager: apt, spdlogVersion: "v1.9.2"}
-      - { os: almalinux8, packageManager: dnf, spdlogVersion: "v1.5.0"}
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
 
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
@@ -41,11 +43,15 @@ jobs:
         aptPackages: ${{ parameters.aptPackages }}
         packageManager: ${{ job.packageManager }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+      parameters:
+        dependencyList:
+          - fmtlib
     - task: Bash@3
-      displayName: Clone spdlog ${{ job.spdlogVersion }}
+      displayName: Clone spdlog ${{ parameters.spdlogVersion }}
       inputs:
         targetType: inline
-        script: git clone https://github.com/gabime/spdlog.git -b ${{ job.spdlogVersion }}
+        script: git clone https://github.com/gabime/spdlog.git -b ${{ parameters.spdlogVersion }}
         workingDirectory: $(Agent.BuildDirectory)
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
@@ -54,6 +60,7 @@ jobs:
         cmakeSourceDir: $(Agent.BuildDirectory)/spdlog
         useAmdclang: false
         extraBuildFlags: >-
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/vendor
           -DCMAKE_BUILD_TYPE=Release
           -DSPDLOG_USE_STD_FORMAT=OFF
           -DSPDLOG_FMT_EXTERNAL_HO=ON
diff --git a/.azuredevops/tag-builds/fmtlib.yml b/.azuredevops/tag-builds/fmtlib.yml
new file mode 100644
index 000000000..37d807b67
--- /dev/null
+++ b/.azuredevops/tag-builds/fmtlib.yml
@@ -0,0 +1,23 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: fmtlibVersion
+  type: string
+  default: "11.1.3"
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none
+pr: none
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/fmtlib.yml
+    parameters:
+      fmtlibVersion: ${{ parameters.fmtlibVersion }}
diff --git a/.azuredevops/tag-builds/spdlog.yml b/.azuredevops/tag-builds/spdlog.yml
index 0d8de151e..3fbf62288 100644
--- a/.azuredevops/tag-builds/spdlog.yml
+++ b/.azuredevops/tag-builds/spdlog.yml
@@ -2,6 +2,11 @@ variables:
 - group: common
 - template: /.azuredevops/variables-global.yml
 
+parameters:
+- name: fmtlibVersion
+  type: string
+  default: "v1.15.1"
+
 resources:
   repositories:
   - repository: pipelines_repo
@@ -13,4 +18,6 @@ trigger: none
 pr: none
 
 jobs:
-  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/spdlog.yml
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/fmtlib.yml
+    parameters:
+      fmtlibVersion: ${{ parameters.fmtlibVersion }}
diff --git a/.azuredevops/templates/steps/dependencies-vendor.yml b/.azuredevops/templates/steps/dependencies-vendor.yml
index 8d885b553..615adafd8 100644
--- a/.azuredevops/templates/steps/dependencies-vendor.yml
+++ b/.azuredevops/templates/steps/dependencies-vendor.yml
@@ -8,10 +8,12 @@ parameters:
   type: object
   default:
     boost: 250
+    fmtlib: 341
     grpc: 72
     gtest: 73
     half560: 68
     lapack: 69
+    spdlog: 340
 
 steps:
 - ${{ each dependency in parameters.dependencyList }}:

From 2b0ce5e5c20bc14b7373967891572780a1289955 Mon Sep 17 00:00:00 2001
From: David Dixon <165835255+davidd-amd@users.noreply.github.com>
Date: Wed, 3 Sep 2025 13:59:41 -0600
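Subject: [PATCH 12/53] Fix typo (#5250)

Point the spdlog tag build at the spdlog dependency template and rename
its version parameter from fmtlibVersion to spdlogVersion.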
---
 .azuredevops/tag-builds/spdlog.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.azuredevops/tag-builds/spdlog.yml b/.azuredevops/tag-builds/spdlog.yml
index 3fbf62288..300079340 100644
--- a/.azuredevops/tag-builds/spdlog.yml
+++ b/.azuredevops/tag-builds/spdlog.yml
@@ -3,7 +3,7 @@ variables:
 - template: /.azuredevops/variables-global.yml
 
 parameters:
-- name: fmtlibVersion
+- name: spdlogVersion
   type: string
   default: "v1.15.1"
 
@@ -18,6 +18,6 @@ trigger: none
 pr: none
 
 jobs:
-  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/fmtlib.yml
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/spdlog.yml
     parameters:
-      fmtlibVersion: ${{ parameters.fmtlibVersion }}
+      spdlogVersion: ${{ parameters.spdlogVersion }}

From 3aab9e1bc54851c653ac7cf7613e0418b9942a84 Mon Sep 17 00:00:00 2001
From: Joseph Macaranas <145489236+jayhawk-commits@users.noreply.github.com>
Date: Wed, 3 Sep 2025 16:58:17 -0400
Subject: [PATCH 13/53] Modify sparseCheckoutDirectories in checkout.yml
 (#5251)

Added 'shared' to sparseCheckoutDirectories parameter.
---
 .azuredevops/templates/steps/checkout.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/templates/steps/checkout.yml b/.azuredevops/templates/steps/checkout.yml
index f021cbc40..4c5d58f56 100644
--- a/.azuredevops/templates/steps/checkout.yml
+++ b/.azuredevops/templates/steps/checkout.yml
@@ -20,7 +20,7 @@ steps:
     retryCountOnTaskFailure: 3
     fetchFilter: blob:none
     ${{ if ne(parameters.sparseCheckoutDir, '') }}:
-      sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }}
+      sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }} shared
       path: sparse
   - ${{ if ne(parameters.sparseCheckoutDir, '') }}:
     - task: Bash@3

From 9e1a82d327bf0e2713a78c9e5e3370b9fb82d133 Mon Sep 17 00:00:00 2001
From: David Dixon <165835255+davidd-amd@users.noreply.github.com>
Date: Wed, 3 Sep 2025 20:11:38 -0600
Subject: [PATCH 14/53] Add libdivide (#5252)

---
 .azuredevops/dependencies/libdivide.yml | 64 +++++++++++++++++++++++++
 .azuredevops/tag-builds/libdivide.yml   | 23 +++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 .azuredevops/dependencies/libdivide.yml
 create mode 100644 .azuredevops/tag-builds/libdivide.yml

diff --git a/.azuredevops/dependencies/libdivide.yml b/.azuredevops/dependencies/libdivide.yml
new file mode 100644
index 000000000..e20a1ccea
--- /dev/null
+++ b/.azuredevops/dependencies/libdivide.yml
@@ -0,0 +1,64 @@
+parameters:
+- name: checkoutRepo
+  type: string
+  default: 'self'
+- name: checkoutRef
+  type: string
+  default: ''
+- name: libdivideVersion
+  type: string
+  default: ''
+- name: aptPackages
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+
+- name: jobMatrix
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
+
+jobs:
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: libdivide_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - task: Bash@3
+      displayName: Clone libdivide ${{ parameters.libdivideVersion }}
+      inputs:
+        targetType: inline
+        script: git clone https://github.com/ridiculousfish/libdivide.git -b ${{ parameters.libdivideVersion }}
+        workingDirectory: $(Agent.BuildDirectory)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/libdivide/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/libdivide
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_BUILD_TYPE=Release
+          -DLIBDIVIDE_BUILD_TESTS=OFF
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
diff --git a/.azuredevops/tag-builds/libdivide.yml b/.azuredevops/tag-builds/libdivide.yml
new file mode 100644
index 000000000..7ae199743
--- /dev/null
+++ b/.azuredevops/tag-builds/libdivide.yml
@@ -0,0 +1,23 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: libdivideVersion
+  type: string
+  default: master
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none
+pr: none
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/libdivide.yml
+    parameters:
+      libdivideVersion: ${{ parameters.libdivideVersion }}

From 2f401895757814beee25df978c7ade905cf7c164 Mon Sep 17 00:00:00 2001
From: David Dixon <165835255+davidd-amd@users.noreply.github.com>
Date: Thu, 4 Sep 2025 18:48:34 -0600
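Subject: [PATCH 15/53] Add catch2 (#5257)

- Add catch2 dependency and tag-build pipelines.
- Register catch2 and libdivide in the dependencies-vendor.yml defaults.
- Set cleanDestinationFolder to false when extracting vendored archives.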
---
 .azuredevops/dependencies/catch2.yml          | 63 +++++++++++++++++++
 .azuredevops/tag-builds/catch2.yml            | 23 +++++++
 .../templates/steps/dependencies-vendor.yml   |  4 +-
 3 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 .azuredevops/dependencies/catch2.yml
 create mode 100644 .azuredevops/tag-builds/catch2.yml

diff --git a/.azuredevops/dependencies/catch2.yml b/.azuredevops/dependencies/catch2.yml
new file mode 100644
index 000000000..aaf1d41be
--- /dev/null
+++ b/.azuredevops/dependencies/catch2.yml
@@ -0,0 +1,63 @@
+parameters:
+- name: checkoutRepo
+  type: string
+  default: 'self'
+- name: checkoutRef
+  type: string
+  default: ''
+- name: catch2Version
+  type: string
+  default: ''
+- name: aptPackages
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+
+- name: jobMatrix
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
+
+jobs:
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: catch2_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - task: Bash@3
+      displayName: Clone catch2 ${{ parameters.catch2Version }}
+      inputs:
+        targetType: inline
+        script: git clone https://github.com/catchorg/Catch2.git -b ${{ parameters.catch2Version }}
+        workingDirectory: $(Agent.BuildDirectory)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/Catch2/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/Catch2
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_BUILD_TYPE=Release
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
diff --git a/.azuredevops/tag-builds/catch2.yml b/.azuredevops/tag-builds/catch2.yml
new file mode 100644
index 000000000..ded20ab86
--- /dev/null
+++ b/.azuredevops/tag-builds/catch2.yml
@@ -0,0 +1,23 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: catch2Version
+  type: string
+  default: "v3.7.0"
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none
+pr: none
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/catch2.yml
+    parameters:
+      catch2Version: ${{ parameters.catch2Version }}
diff --git a/.azuredevops/templates/steps/dependencies-vendor.yml b/.azuredevops/templates/steps/dependencies-vendor.yml
index 615adafd8..10086e38e 100644
--- a/.azuredevops/templates/steps/dependencies-vendor.yml
+++ b/.azuredevops/templates/steps/dependencies-vendor.yml
@@ -8,11 +8,13 @@ parameters:
   type: object
   default:
     boost: 250
+    catch2: 343
     fmtlib: 341
     grpc: 72
     gtest: 73
     half560: 68
     lapack: 69
+    libdivide: 342
     spdlog: 340
 
 steps:
@@ -31,7 +33,7 @@ steps:
     inputs:
       archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
       destinationFolder: $(Agent.BuildDirectory)/vendor
-      cleanDestinationFolder: true
+      cleanDestinationFolder: false
       overwriteExistingFiles: true
   - task: DeleteFiles@1
     displayName: Clean up ${{ dependency }}

From e5345a9ccad64909a03eeed0db87f9f1e38a3dca Mon Sep 17 00:00:00 2001
From: Joseph Macaranas <145489236+jayhawk-commits@users.noreply.github.com>
Date: Fri, 5 Sep 2025 10:12:39 -0400
Subject: [PATCH 16/53] External CI: rocdecode downstream builds (#5254)

- Trigger a downstream build of rocPyDecode from the rocDecode pipeline.
- Add the same monorepo-related parameters used by other component pipelines, even though these projects are not in the super-repos.
---
 .azuredevops/components/rocDecode.yml   | 44 +++++++++++++++++++++++++
 .azuredevops/components/rocPyDecode.yml | 36 +++++++++++++++-----
 2 files changed, 72 insertions(+), 8 deletions(-)

diff --git a/.azuredevops/components/rocDecode.yml b/.azuredevops/components/rocDecode.yml
index ee1d5ccfc..f71c3cf48 100644
--- a/.azuredevops/components/rocDecode.yml
+++ b/.azuredevops/components/rocDecode.yml
@@ -8,6 +8,22 @@ parameters:
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -56,10 +72,23 @@ parameters:
     testJobs:
       - { os: ubuntu2204, packageManager: apt, target: gfx942 }
       - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+- name: downstreamComponentMatrix
+  type: object
+  default:
+    - rocPyDecode:
+      name: rocPyDecode
+      sparseCheckoutDir: ''
+      skipUnifiedBuild: 'false'
+      buildDependsOn:
+        - rocDecode_build
 
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
   - job: ${{ parameters.componentName }}_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
     variables:
     - group: common
     - template: /.azuredevops/variables-global.yml
@@ -83,12 +112,15 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
       parameters:
         checkoutRef: ${{ parameters.checkoutRef }}
         dependencyList: ${{ parameters.rocmDependencies }}
         os: ${{ job.os }}
         aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
         os: ${{ job.os }}
@@ -169,3 +201,15 @@ jobs:
         registerROCmPackages: true
         environment: test
         gpuTarget: ${{ job.target }}
+
+- ${{ if parameters.triggerDownstreamJobs }}:
+  - ${{ each component in parameters.downstreamComponentMatrix }}:
+    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
+      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+          buildDependsOn: ${{ component.buildDependsOn }}
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
+          triggerDownstreamJobs: true
+          unifiedBuild: ${{ parameters.unifiedBuild }}
diff --git a/.azuredevops/components/rocPyDecode.yml b/.azuredevops/components/rocPyDecode.yml
index 6e85a43ef..615148a49 100644
--- a/.azuredevops/components/rocPyDecode.yml
+++ b/.azuredevops/components/rocPyDecode.yml
@@ -5,6 +5,22 @@ parameters:
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -47,19 +63,19 @@ parameters:
   type: object
   default:
     buildJobs:
-      - gfx942:
-        target: gfx942
-      - gfx90a:
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
     testJobs:
-      - gfx942:
-        target: gfx942
-      - gfx90a:
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
 
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
   - job: rocPyDecode_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
     variables:
     - group: common
     - template: /.azuredevops/variables-global.yml
@@ -74,16 +90,20 @@ jobs:
       parameters:
         aptPackages: ${{ parameters.aptPackages }}
         pipModules: ${{ parameters.pipModules }}
+        packageManager: ${{ job.packageManager }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
       parameters:
         checkoutRef: ${{ parameters.checkoutRef }}
         dependencyList: ${{ parameters.rocmDependencies }}
         gpuTarget: ${{ job.target }}
         aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
     - task: Bash@3
       displayName: 'Save Python Package Paths'
       inputs:

From 76fd6b22902a5494b6516b237d84f67e4f0463f8 Mon Sep 17 00:00:00 2001
From: Matt Williams <matt.williams@amd.com>
Date: Fri, 5 Sep 2025 11:45:06 -0400
Subject: [PATCH 17/53] Update broken hipBLASLt benchmark README link (#5258)

---
 docs/how-to/rocm-for-ai/inference-optimization/workload.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
index 0580e7434..bc9463f58 100644
--- a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
@@ -939,7 +939,7 @@ hipBLASLt benchmarking
 The GEMM library
 `hipBLASLt <https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/index.html>`_
 provides a benchmark tool for its supported operations. Refer to the
-`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/benchmarks/README.md>`_
+`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/bench/README.md>`_
 for details.
 
 * Example 1: Benchmark mix fp8 GEMM

From 4bc1bf00c600326949962c5dd86fd643b3098e87 Mon Sep 17 00:00:00 2001
From: Peter Park <peter.park@amd.com>
Date: Fri, 5 Sep 2025 12:07:51 -0400
Subject: [PATCH 18/53] Update PyTorch training benchmark docker doc to 25.7
 (#5255)

* Update PyTorch training benchmark docker doc to 25.7

* update .wordlist.txt

* update conf.py

* update data sheet

* fix sphinx warnings
---
 .wordlist.txt                                 |   1 +
 docs/conf.py                                  |   4 +
 ...torch-training-v25.6-benchmark-models.yaml | 120 ++++
 .../pytorch-training-benchmark-models.yaml    | 120 ++--
 docs/how-to/deep-learning-rocm.rst            |  90 ++-
 .../previous-versions/vllm-0.9.1-20250715.rst |   4 +-
 .../inference/benchmark-docker/vllm.rst       |   4 +-
 docs/how-to/rocm-for-ai/install.rst           |   4 +-
 .../megatron-lm-v24.12-dev.rst                |   4 +-
 .../previous-versions/megatron-lm-v25.3.rst   |   6 +-
 .../previous-versions/megatron-lm-v25.4.rst   |   6 +-
 .../pytorch-training-history.rst              |  10 +-
 .../pytorch-training-v25.5.rst                |   5 +
 .../pytorch-training-v25.6.rst                | 456 ++++++++++++++
 .../benchmark-docker/pytorch-training.rst     | 585 +++++++++++-------
 15 files changed, 1079 insertions(+), 340 deletions(-)
 create mode 100644 docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
 create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst

diff --git a/.wordlist.txt b/.wordlist.txt
index 09236fa95..4eb5df599 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -918,6 +918,7 @@ toolchain
 toolchains
 toolset
 toolsets
+torchtitan
 torchvision
 tqdm
 tracebacks
diff --git a/docs/conf.py b/docs/conf.py
index 6f3979312..6e7fa5e61 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -124,11 +124,15 @@ article_pages = [
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
     {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
diff --git a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
new file mode 100644
index 000000000..df0a198d5
--- /dev/null
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
@@ -0,0 +1,120 @@
+unified_docker:
+  latest:
+    pull_tag: rocm/pytorch-training:v25.6
+    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
+    rocm_version: 6.4.1
+    pytorch_version: 2.8.0a0+git7d205b2
+    python_version: 3.10.17
+    transformer_engine_version: 1.14.0+2f85f5f2
+    flash_attention_version: 3.0.0.post1
+    hipblaslt_version: 0.15.0-8c6919d
+    triton_version: 3.3.0
+model_groups:
+  - group: Pre-training
+    tag: pre-training
+    models:
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [pretrain]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+      precision: BF16
+      training_modes: [pretrain]
+    - model: FLUX.1-dev
+      mad_tag: pyt_train_flux
+      model_repo: Flux
+      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+      precision: BF16
+      training_modes: [pretrain]
+  - group: Fine-tuning
+    tag: fine-tuning
+    models:
+    - model: Llama 4 Scout 17B-16E
+      mad_tag: pyt_train_llama-4-scout-17b-16e
+      model_repo: Llama-4-17B_16E
+      url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.3 70B
+      mad_tag: pyt_train_llama-3.3-70b
+      model_repo: Llama-3.3-70B
+      url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 3.2 1B
+      mad_tag: pyt_train_llama-3.2-1b
+      model_repo: Llama-3.2-1B
+      url: https://huggingface.co/meta-llama/Llama-3.2-1B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 3B
+      mad_tag: pyt_train_llama-3.2-3b
+      model_repo: Llama-3.2-3B
+      url: https://huggingface.co/meta-llama/Llama-3.2-3B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 Vision 11B
+      mad_tag: pyt_train_llama-3.2-vision-11b
+      model_repo: Llama-3.2-Vision-11B
+      url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.2 Vision 90B
+      mad_tag: pyt_train_llama-3.2-vision-90b
+      model_repo: Llama-3.2-Vision-90B
+      url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 3.1 405B
+      mad_tag: pyt_train_llama-3.1-405b
+      model_repo: Llama-3.1-405B
+      url: https://huggingface.co/meta-llama/Llama-3.1-405B
+      precision: BF16
+      training_modes: [finetune_qlora, HF_finetune_lora]
+    - model: Llama 3 8B
+      mad_tag: pyt_train_llama-3-8b
+      model_repo: Llama-3-8B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3 70B
+      mad_tag: pyt_train_llama-3-70b
+      model_repo: Llama-3-70B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 7B
+      mad_tag: pyt_train_llama-2-7b
+      model_repo: Llama-2-7B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 2 13B
+      mad_tag: pyt_train_llama-2-13b
+      model_repo: Llama-2-13B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 70B
+      mad_tag: pyt_train_llama-2-70b
+      model_repo: Llama-2-70B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
diff --git a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
index df0a198d5..dc19843be 100644
--- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
@@ -1,38 +1,17 @@
-unified_docker:
-  latest:
-    pull_tag: rocm/pytorch-training:v25.6
-    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
-    rocm_version: 6.4.1
-    pytorch_version: 2.8.0a0+git7d205b2
-    python_version: 3.10.17
-    transformer_engine_version: 1.14.0+2f85f5f2
-    flash_attention_version: 3.0.0.post1
-    hipblaslt_version: 0.15.0-8c6919d
-    triton_version: 3.3.0
+dockers:
+  - pull_tag: rocm/pytorch-training:v25.7
+    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
+    components:
+      ROCm: 6.4.2
+      PyTorch: 2.8.0a0+gitd06a406
+      Python: 3.10.18
+      Transformer Engine: 2.2.0.dev0+94e53dd8
+      Flash Attention: 3.0.0.post1
+      hipBLASLt: 1.1.0-4b9a52edfc
+      Triton: 3.3.0
 model_groups:
-  - group: Pre-training
-    tag: pre-training
-    models:
-    - model: Llama 3.1 8B
-      mad_tag: pyt_train_llama-3.1-8b
-      model_repo: Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-      training_modes: [pretrain]
-    - model: Llama 3.1 70B
-      mad_tag: pyt_train_llama-3.1-70b
-      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-      precision: BF16
-      training_modes: [pretrain]
-    - model: FLUX.1-dev
-      mad_tag: pyt_train_flux
-      model_repo: Flux
-      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-      precision: BF16
-      training_modes: [pretrain]
-  - group: Fine-tuning
-    tag: fine-tuning
+  - group: Meta Llama
+    tag: llama
     models:
     - model: Llama 4 Scout 17B-16E
       mad_tag: pyt_train_llama-4-scout-17b-16e
@@ -75,19 +54,19 @@ model_groups:
       model_repo: Llama-3.1-8B
       url: https://huggingface.co/meta-llama/Llama-3.1-8B
       precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
+      training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
     - model: Llama 3.1 70B
       mad_tag: pyt_train_llama-3.1-70b
       model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
       precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+      training_modes: [pretrain, finetune_fw, finetune_lora]
     - model: Llama 3.1 405B
       mad_tag: pyt_train_llama-3.1-405b
       model_repo: Llama-3.1-405B
       url: https://huggingface.co/meta-llama/Llama-3.1-405B
       precision: BF16
-      training_modes: [finetune_qlora, HF_finetune_lora]
+      training_modes: [finetune_qlora]
     - model: Llama 3 8B
       mad_tag: pyt_train_llama-3-8b
       model_repo: Llama-3-8B
@@ -117,4 +96,67 @@ model_groups:
       model_repo: Llama-2-70B
       url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
       precision: BF16
-      training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
+      training_modes: [finetune_lora, finetune_qlora]
+  - group: OpenAI
+    tag: openai
+    models:
+    - model: GPT OSS 20B
+      mad_tag: pyt_train_gpt_oss_20b
+      model_repo: GPT-OSS-20B
+      url: https://huggingface.co/openai/gpt-oss-20b
+      precision: BF16
+      training_modes: [HF_finetune_lora]
+    - model: GPT OSS 120B
+      mad_tag: pyt_train_gpt_oss_120b
+      model_repo: GPT-OSS-120B
+      url: https://huggingface.co/openai/gpt-oss-120b
+      precision: BF16
+      training_modes: [HF_finetune_lora]
+  - group: Qwen
+    tag: qwen
+    models:
+    - model: Qwen 3 8B
+      mad_tag: pyt_train_qwen3-8b
+      model_repo: Qwen3-8B
+      url: https://huggingface.co/Qwen/Qwen3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Qwen 3 32B
+      mad_tag: pyt_train_qwen3-32b
+      model_repo: Qwen3-32
+      url: https://huggingface.co/Qwen/Qwen3-32B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2.5 32B
+      mad_tag: pyt_train_qwen2.5-32b
+      model_repo: Qwen2.5-32B
+      url: https://huggingface.co/Qwen/Qwen2.5-32B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2.5 72B
+      mad_tag: pyt_train_qwen2.5-72b
+      model_repo: Qwen2.5-72B
+      url: https://huggingface.co/Qwen/Qwen2.5-72B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2 1.5B
+      mad_tag: pyt_train_qwen2-1.5b
+      model_repo: Qwen2-1.5B
+      url: https://huggingface.co/Qwen/Qwen2-1.5B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Qwen 2 7B
+      mad_tag: pyt_train_qwen2-7b
+      model_repo: Qwen2-7B
+      url: https://huggingface.co/Qwen/Qwen2-7B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+  - group: Flux
+    tag: flux
+    models:
+    - model: FLUX.1-dev
+      mad_tag: pyt_train_flux
+      model_repo: Flux
+      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+      precision: BF16
+      training_modes: [pretrain]
diff --git a/docs/how-to/deep-learning-rocm.rst b/docs/how-to/deep-learning-rocm.rst
index 16dad363c..fb1d55a3c 100644
--- a/docs/how-to/deep-learning-rocm.rst
+++ b/docs/how-to/deep-learning-rocm.rst
@@ -23,93 +23,92 @@ The table below summarizes information about ROCm-enabled deep learning framewor
       - Installation options
       - GitHub
 
-    * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`_
+    * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`__
       - .. raw:: html
-         
+
           <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html"><i class="fas fa-link fa-lg"></i></a>
       - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`_ 
-        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`_
-        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`_ 
-        - `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`__
+        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`__
+        - `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`__
       - .. raw:: html
-         
+
           <a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
-   
-    * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`_
+
+    * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`__
       - .. raw:: html
-         
+
           <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html"><i class="fas fa-link fa-lg"></i></a>
       - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`_
-        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`__
 
       - .. raw:: html
-         
+
           <a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a> 
 
-    * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`_
+    * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`__
       - .. raw:: html
-         
+
           <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html"><i class="fas fa-link fa-lg"></i></a>
       - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`__
       - .. raw:: html
-         
+
           <a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
-   
-    * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`_
+
+    * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`__
       - .. raw:: html
-         
+
           <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html"><i class="fas fa-link fa-lg"></i></a>
       - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`__
       - .. raw:: html
-         
+
           <a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>
 
-    * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`_
+    * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`__
       - .. raw:: html
-         
+
           <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html"><i class="fas fa-link fa-lg"></i></a>
       - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`__
       - .. raw:: html
-         
+
           <a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
-   
-    * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`_
+
+    * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`__
       - .. raw:: html
-         
+
           <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
       - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
       - .. raw:: html
-         
+
           <a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a> 
 
-    * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`_
+    * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`__
       - .. raw:: html
-         
+
           <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html"><i class="fas fa-link fa-lg"></i></a>
       - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`__
       - .. raw:: html
-         
+
           <a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
-   
-    * - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`_
+
+    * - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`__
       - .. raw:: html
-         
+
           <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
       - 
-        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`_ 
-        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`_
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__
 
       - .. raw:: html
-         
-          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>      
 
+          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
 
 Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
 through the following guides.
@@ -124,10 +123,3 @@ through the following guides.
 
 * :doc:`Use ROCm for AI inference optimization <rocm-for-ai/inference-optimization/index>`
 
-
-
-
-
-
-
-
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
index 9e0f4443a..34df0359d 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
@@ -46,7 +46,7 @@ vLLM inference performance testing
         - {{ unified_docker.hipblaslt_version }}
 
 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements>` for
+inference performance numbers <vllm-benchmark-performance-measurements-715>` for
 MI300X series accelerators.
 
 What's new
@@ -219,7 +219,7 @@ system's configuration.
             ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
             model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
 
-            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
+            Although the :ref:`available models <vllm-benchmark-available-models-715>` are preconfigured
             to collect latency and throughput performance data, you can also change the benchmarking
             parameters. See the standalone benchmarking tab for more information.
 
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
index 02c992620..9f3bd608d 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
@@ -39,7 +39,7 @@ vLLM inference performance testing
         - {{ unified_docker.hipblaslt_version }}
 
 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements>` for
+inference performance numbers <vllm-benchmark-performance-measurements-812>` for
 MI300X series accelerators.
 
 What's new
@@ -208,7 +208,7 @@ system's configuration.
             and ``{{ model.mad_tag }}_serving.csv``.
 
             Although the :ref:`available models
-            <vllm-benchmark-available-models>` are preconfigured to collect
+            <vllm-benchmark-available-models-812>` are preconfigured to collect
             offline throughput and online serving performance data, you can
             also change the benchmarking parameters. See the standalone
             benchmarking tab for more information.
diff --git a/docs/how-to/rocm-for-ai/install.rst b/docs/how-to/rocm-for-ai/install.rst
index 6847d06b4..cb949cb31 100644
--- a/docs/how-to/rocm-for-ai/install.rst
+++ b/docs/how-to/rocm-for-ai/install.rst
@@ -22,9 +22,9 @@ If you’re new to ROCm, refer to the :doc:`ROCm quick start install guide for L
 <rocm-install-on-linux:install/quick-start>`.
 
 If you’re using a Radeon GPU for graphics-accelerated applications, refer to the
-`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/docs-6.1.3/docs/install/native_linux/install-radeon.html>`_.
+`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/howto_native_linux.html>`_.
 
-You can install ROCm on :ref:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
+You can install ROCm on :doc:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
 distribution's package manager. See the following documentation resources to get started:
 
 * :doc:`ROCm installation overview <rocm-install-on-linux:install/install-overview>`
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
index a9d99378e..c18b1dfea 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
@@ -18,7 +18,7 @@ Training a model with ROCm Megatron-LM
 The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
 enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
 accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
-workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
+workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support-24-12>`
 like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
 efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.
 
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e
 
 - Pre-training
 
-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-24-12:
 
 The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
 
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
index 3a2f23322..e039aff8a 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e
 
 - Pre-training
 
-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-25-3:
 
 The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.
 
@@ -278,7 +278,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
    .. tab-item:: Llama
       :sync: llama
 
-      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
+      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``Llama2Tokenizer``.
 
       To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
       Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
@@ -292,7 +292,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
    .. tab-item:: DeepSeek V2
       :sync: deepseek
 
-      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
+      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``DeepSeekV2Tokenizer``.
 
 Multi-node training
 ^^^^^^^^^^^^^^^^^^^
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
index 76e5eb716..9d7c7ecd6 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e
 
 - Pre-training
 
-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-25-4:
 
 The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
 
@@ -291,7 +291,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
    .. tab-item:: Llama
       :sync: llama
 
-      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
+      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``Llama2Tokenizer``
       or the default ``HuggingFaceTokenizer``.
 
       To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
@@ -320,7 +320,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
    .. tab-item:: DeepSeek V2
       :sync: deepseek
 
-      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
+      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``DeepSeekV2Tokenizer``.
 
 Multi-node training
 ^^^^^^^^^^^^^^^^^^^
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
index 1535f1d43..07d640159 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
      - Components
      - Resources
 
+   * - v25.7
+     - 
+       * ROCm 6.4.2
+       * PyTorch 2.8.0a0+gitd06a406
+     - 
+       * :doc:`Documentation <../pytorch-training>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712>`__
+
    * - v25.6
      - 
        * ROCm 6.3.4
        * PyTorch 2.8.0a0+git7d205b2
      - 
-       * :doc:`Documentation <../pytorch-training>`
+       * :doc:`Documentation <pytorch-training-v25.6>`
        * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__
 
    * - v25.5
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
index a43297657..e68a1092b 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
@@ -437,3 +437,8 @@ Once the setup is complete, choose between two options to start benchmarking:
 
            ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B
 
+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst
new file mode 100644
index 000000000..f9bc57a43
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst
@@ -0,0 +1,456 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using PyTorch for ROCm.
+   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with PyTorch for ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of the ROCm PyTorch
+   training performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
+
+PyTorch is an open-source machine learning framework that is widely used for
+model training with GPU-optimized components for transformer-based models.
+
+The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_
+(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
+model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
+training workloads:
+
++--------------------------+--------------------------------+
+| Software component       | Version                        |
++==========================+================================+
+| ROCm                     | 6.3.4                          |
++--------------------------+--------------------------------+
+| PyTorch                  | 2.8.0a0+git7d205b2             |
++--------------------------+--------------------------------+
+| Python                   | 3.10.17                        |
++--------------------------+--------------------------------+
+| Transformer Engine       | 1.14.0+2f85f5f2                |
++--------------------------+--------------------------------+
+| Flash Attention          | 3.0.0.post1                    |
++--------------------------+--------------------------------+
+| hipBLASLt                | 0.15.0-8c6919d                 |
++--------------------------+--------------------------------+
+| Triton                   | 3.3.0                          |
++--------------------------+--------------------------------+
+
+.. _amd-pytorch-training-model-support-v256:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
+
+   {% set unified_docker = data.unified_docker.latest %}
+   {% set model_groups = data.model_groups %}
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+        <div class="row">
+          <div class="col-2 me-2 model-param-head">Workload</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+          </div>
+        </div>
+
+        <div class="row mt-1">
+          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+          </div>
+        </div>
+      </div>
+
+   .. note::
+
+      Some models require an external license agreement through a third party (for example, Meta).
+
+   .. _amd-pytorch-training-performance-measurements-v256:
+
+   Performance measurements
+   ========================
+
+   To evaluate performance, the
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+   page provides reference throughput and latency measurements for training
+   popular AI models.
+
+   .. note::
+
+      The performance data presented in
+      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+      should not be interpreted as the peak performance achievable by AMD
+      Instinct MI325X and MI300X accelerators or ROCm software.
+
+   System validation
+   =================
+
+   Before running AI workloads, it's important to validate that your AMD hardware is configured
+   correctly and performing optimally.
+
+   If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+   can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+   optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+   before starting training.
+
+   To test for optimal performance, consult the recommended :ref:`System health benchmarks
+   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+   system's configuration.
+
+   This Docker image is optimized for specific model configurations outlined
+   below. Performance can vary for other training workloads, as AMD
+   doesn’t validate configurations and run conditions outside those described.
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to start benchmarking:
+
+   .. tab-set::
+
+      .. tab-item:: MAD-integrated benchmarking
+
+         Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+         directory and install the required packages on the host machine.
+
+         .. code-block:: shell
+
+            git clone https://github.com/ROCm/MAD
+            cd MAD
+            pip install -r requirements.txt
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            For example, use this command to run the performance benchmark test on the {{ model.model }} model
+            using one GPU with the {{ model.precision }} data type on the host machine.
+
+            .. code-block:: shell
+
+               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+               madengine run \
+                   --tags {{ model.mad_tag }} \
+                   --keep-model-dir \
+                   --live-output \
+                   --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/perf.csv``.
+
+      {% endfor %}
+   {% endfor %}
+
+      .. tab-item:: Standalone benchmarking
+
+         .. rubric:: Download the Docker image and required packages
+
+         Use the following command to pull the Docker image from Docker Hub.
+
+         .. code-block:: shell
+
+            docker pull {{ unified_docker.pull_tag }}
+
+         Run the Docker container.
+
+         .. code-block:: shell
+
+            docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
+
+         Use these commands if you exit the ``training_env`` container and need to return to it.
+
+         .. code-block:: shell
+
+            docker start training_env
+            docker exec -it training_env bash
+
+         In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+         repository and navigate to the benchmark scripts directory
+         ``/workspace/MAD/scripts/pytorch_train``.
+
+         .. code-block:: shell
+
+            git clone https://github.com/ROCm/MAD
+            cd MAD/scripts/pytorch_train
+
+         .. rubric:: Prepare training datasets and dependencies
+
+         The following benchmarking examples require downloading models and datasets
+         from Hugging Face. To ensure successful access to gated repos, set your
+         ``HF_TOKEN``.
+
+         .. code-block:: shell
+
+            export HF_TOKEN=$your_personal_hugging_face_access_token
+
+         Run the setup script to install libraries and datasets needed for benchmarking.
+
+         .. code-block:: shell
+
+            ./pytorch_benchmark_setup.sh
+
+         .. container:: model-doc pyt_train_llama-3.1-8b
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``accelerate``
+                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+         .. container:: model-doc pyt_train_llama-3.1-70b
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+               * - ``torchdata``
+                 - `TorchData <https://pytorch.org/data/beta/index.html>`_
+
+               * - ``tomli``
+                 - `Tomli <https://pypi.org/project/tomli/>`_
+
+               * - ``tiktoken``
+                 - `tiktoken <https://github.com/openai/tiktoken>`_
+
+               * - ``blobfile``
+                 - `blobfile <https://pypi.org/project/blobfile/>`_
+
+               * - ``tabulate``
+                 - `tabulate <https://pypi.org/project/tabulate/>`_
+
+               * - ``wandb``
+                 - `Weights & Biases <https://github.com/wandb/wandb>`_
+
+               * - ``sentencepiece``
+                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+
+               * - ``tensorboard``
+                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+
+         .. container:: model-doc pyt_train_flux
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``accelerate``
+                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+               * - ``sentencepiece``
+                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+
+               * - ``tensorboard``
+                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+
+               * - ``csvkit``
+                 - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
+
+               * - ``deepspeed``
+                 - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
+
+               * - ``diffusers``
+                 - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
+
+               * - ``GitPython``
+                 - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
+
+               * - ``opencv-python-headless``
+                 - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
+
+               * - ``peft``
+                 - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
+
+               * - ``protobuf``
+                 - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
+
+               * - ``pytest``
+                 - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
+
+               * - ``python-dotenv``
+                 - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
+
+               * - ``seaborn``
+                 - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
+
+               * - ``transformers``
+                 - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
+
+         ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
+
+         * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+         {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Pretraining
+
+            To start the pre-training benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+            {% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
+               * - ``$datatype``
+                 - ``BF16`` or ``FP8``
+                 - Only Llama 3.1 8B supports FP8 precision.
+            {% else %}
+               * - ``$datatype``
+                 - ``BF16``
+                 - Only Llama 3.1 8B supports FP8 precision.
+            {% endif %}
+
+               * - ``$sequence_length``
+                 - Between 2048 and 8192. 8192 by default.
+                 - Sequence length for the language model.
+
+            {% if model.mad_tag == "pyt_train_flux" %}
+            .. container:: model-doc {{ model.mad_tag }}
+
+               .. note::
+
+                  Occasionally, downloading the Flux dataset might fail. In the event of this
+                  error, manually download it from Hugging Face at
+                  `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+                  and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
+                  the required dataset.
+            {% endif %}
+         {% endif %}
+
+         {% if model_group.tag == "fine-tuning" %}
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Fine-tuning
+
+            To start the fine-tuning benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               * - ``$training_mode``
+                 - ``finetune_fw``
+                 - Full weight fine-tuning (BF16 supported)
+
+               * -
+                 - ``finetune_lora``
+                 - LoRA fine-tuning (BF16 supported)
+
+               * -
+                 - ``finetune_qlora``
+                 - QLoRA fine-tuning (BF16 supported)
+
+               * -
+                 - ``HF_finetune_lora``
+                 - LoRA fine-tuning with Hugging Face PEFT
+
+               * - ``$datatype``
+                 - ``BF16``
+                 - All models support BF16.
+
+               * - ``$sequence_length``
+                 - Between 2048 and 16384.
+                 - Sequence length for the language model.
+
+            .. note::
+
+               {{ model.model }} currently supports the following fine-tuning methods:
+
+            {% for method in model.training_modes %}
+               * ``{{ method }}``
+            {% endfor %}
+            {% if model.training_modes|length < 4 %}
+
+               The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
+               does not currently provide YAML configuration files for other combinations of
+               model to fine-tuning method.
+               However, you can still configure your own YAML files to enable support for
+               fine-tuning methods not listed here by following existing patterns in the
+               ``/workspace/torchtune/recipes/configs`` directory.
+            {% endif %}
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+
+               .. rubric:: Benchmarking examples
+
+               For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+
+Further reading
+===============
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
index 46b9daf2f..e7258e07b 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
@@ -9,28 +9,25 @@ Training a model with PyTorch for ROCm
 PyTorch is an open-source machine learning framework that is widely used for
 model training with GPU-optimized components for transformer-based models.
 
-The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
-(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
-model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
-training workloads:
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
 
-+--------------------------+--------------------------------+
-| Software component       | Version                        |
-+==========================+================================+
-| ROCm                     | 6.3.4                          |
-+--------------------------+--------------------------------+
-| PyTorch                  | 2.8.0a0+git7d205b2             |
-+--------------------------+--------------------------------+
-| Python                   | 3.10.17                        |
-+--------------------------+--------------------------------+
-| Transformer Engine       | 1.14.0+2f85f5f2                |
-+--------------------------+--------------------------------+
-| Flash Attention          | 3.0.0.post1                    |
-+--------------------------+--------------------------------+
-| hipBLASLt                | 0.15.0-8c6919d                 |
-+--------------------------+--------------------------------+
-| Triton                   | 3.3.0                          |
-+--------------------------+--------------------------------+
+   {% set dockers = data.dockers %}
+   {% set docker = dockers[0] %}
+   The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__
+   (``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
+   model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
+   training workloads:
+
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}
 
 .. _amd-pytorch-training-model-support:
 
@@ -38,26 +35,27 @@ Supported models
 ================
 
 The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+Some instructions, commands, and training recommendations in this documentation might
+vary by model -- select one to get started.
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
 
-   {% set unified_docker = data.unified_docker.latest %}
+   {% set unified_docker = data.dockers[0] %}
    {% set model_groups = data.model_groups %}
-
    .. raw:: html
 
       <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
         <div class="row">
-          <div class="col-2 me-2 model-param-head">Workload</div>
+          <div class="col-2 me-2 model-param-head">Model group</div>
           <div class="row col-10">
    {% for model_group in model_groups %}
-            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
    {% endfor %}
           </div>
         </div>
 
         <div class="row mt-1">
-          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="col-2 me-2 model-param-head">Model variant</div>
           <div class="row col-10">
    {% for model_group in model_groups %}
       {% set models = model_group.models %}
@@ -73,84 +71,116 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
         </div>
       </div>
 
-   .. note::
 
-      Some models require an external license agreement through a third party (for example, Meta).
+   .. _amd-pytorch-training-supported-training-modes:
 
-   .. _amd-pytorch-training-performance-measurements:
+   The following table lists supported training modes per model.
 
-   Performance measurements
-   ========================
+   .. dropdown:: Supported training modes
 
-   To evaluate performance, the
+      .. list-table::
+         :header-rows: 1
+
+         * - Model
+           - Supported training modes
+
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+         * - {{ model.model }}
+           - ``{{ model.training_modes | join('``, ``') }}``
+
+         {% endfor %}
+      {% endfor %}
+
+      .. note::
+
+         Some model and fine-tuning combinations are not listed. This is
+         because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
+         doesn't provide default YAML configurations for them.
+         For advanced usage, you can create a custom configuration to enable
+         unlisted fine-tuning methods by using an existing file in the
+         ``/workspace/torchtune/recipes/configs`` directory as a template.
+
+.. _amd-pytorch-training-performance-measurements:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+page provides reference throughput and latency measurements for training
+popular AI models.
+
+.. note::
+
+   The performance data presented in
    `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
-   page provides reference throughput and latency measurements for training
-   popular AI models.
+   should not be interpreted as the peak performance achievable by AMD
+   Instinct MI325X and MI300X accelerators or ROCm software.
 
-   .. note::
+System validation
+=================
 
-      The performance data presented in
-      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
-      should not be interpreted as the peak performance achievable by AMD
-      Instinct MI325X and MI300X accelerators or ROCm software.
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
 
-   System validation
-   =================
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
 
-   Before running AI workloads, it's important to validate that your AMD hardware is configured
-   correctly and performing optimally.
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
 
-   If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-   can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-   optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
-   before starting training.
+This Docker image is optimized for specific model configurations outlined
+below. Performance can vary for other training workloads, as AMD
+doesn't test configurations and run conditions outside those described.
 
-   To test for optimal performance, consult the recommended :ref:`System health benchmarks
-   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-   system's configuration.
+Run training
+============
 
-   This Docker image is optimized for specific model configurations outlined
-   below. Performance can vary for other training workloads, as AMD
-   doesn’t validate configurations and run conditions outside those described.
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
 
-   Benchmarking
-   ============
+   {% set unified_docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}
 
-   Once the setup is complete, choose between two options to start benchmarking:
+   Once the setup is complete, choose between two options to start benchmarking training:
 
    .. tab-set::
 
       .. tab-item:: MAD-integrated benchmarking
 
-         Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
-         directory and install the required packages on the host machine.
+         1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+            directory and install the required packages on the host machine.
 
-         .. code-block:: shell
+            .. code-block:: shell
 
-            git clone https://github.com/ROCm/MAD
-            cd MAD
-            pip install -r requirements.txt
+               git clone https://github.com/ROCm/MAD
+               cd MAD
+               pip install -r requirements.txt
 
    {% for model_group in model_groups %}
       {% for model in model_group.models %}
 
          .. container:: model-doc {{ model.mad_tag }}
 
-            For example, use this command to run the performance benchmark test on the {{ model.model }} model
-            using one GPU with the {{ model.precision }} data type on the host machine.
+            2. Use this command to run the performance benchmark test on the {{ model.model }} model
+               using one node with the {{ model.precision }} data type on the host machine.
 
-            .. code-block:: shell
+               .. code-block:: shell
 
-               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-               madengine run \
-                   --tags {{ model.mad_tag }} \
-                   --keep-model-dir \
-                   --live-output \
-                   --timeout 28800
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{ model.mad_tag }} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
 
-            MAD launches a Docker container with the name
-            ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
-            model are collected in the following path: ``~/MAD/perf.csv``.
+               MAD launches a Docker container with the name
+               ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
+               model are collected in ``~/MAD/perf.csv``.
 
       {% endfor %}
    {% endfor %}
@@ -159,222 +189,213 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
 
          .. rubric:: Download the Docker image and required packages
 
-         Use the following command to pull the Docker image from Docker Hub.
+         1. Use the following command to pull the Docker image from Docker Hub.
 
-         .. code-block:: shell
+            .. code-block:: shell
 
-            docker pull {{ unified_docker.pull_tag }}
+               docker pull {{ unified_docker.pull_tag }}
 
-         Run the Docker container.
+         2. Run the Docker container.
 
-         .. code-block:: shell
+            .. code-block:: shell
 
-            docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
+               docker run -it \
+                   --device /dev/dri \
+                   --device /dev/kfd \
+                   --network host \
+                   --ipc host \
+                   --group-add video \
+                   --cap-add SYS_PTRACE \
+                   --security-opt seccomp=unconfined \
+                   --privileged \
+                   -v $HOME:$HOME \
+                   -v $HOME/.ssh:/root/.ssh \
+                   --shm-size 64G \
+                   --name training_env \
+                   {{ unified_docker.pull_tag }}
 
-         Use these commands if you exit the ``training_env`` container and need to return to it.
+            Use these commands if you exit the ``training_env`` container and need to return to it.
 
-         .. code-block:: shell
+            .. code-block:: shell
 
-            docker start training_env
-            docker exec -it training_env bash
+               docker start training_env
+               docker exec -it training_env bash
 
-         In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
-         repository and navigate to the benchmark scripts directory
-         ``/workspace/MAD/scripts/pytorch_train``.
+         3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+            repository and navigate to the benchmark scripts directory
+            ``/workspace/MAD/scripts/pytorch_train``.
 
-         .. code-block:: shell
+            .. code-block:: shell
 
-            git clone https://github.com/ROCm/MAD
-            cd MAD/scripts/pytorch_train
+               git clone https://github.com/ROCm/MAD
+               cd MAD/scripts/pytorch_train
 
          .. rubric:: Prepare training datasets and dependencies
 
-         The following benchmarking examples require downloading models and datasets
-         from Hugging Face. To ensure successful access to gated repos, set your
-         ``HF_TOKEN``.
+         1. The following benchmarking examples require downloading models and datasets
+            from Hugging Face. To ensure successful access to gated repos, set your
+            ``HF_TOKEN``.
 
-         .. code-block:: shell
+            .. code-block:: shell
 
-            export HF_TOKEN=$your_personal_hugging_face_access_token
+               export HF_TOKEN=$your_personal_hugging_face_access_token
 
-         Run the setup script to install libraries and datasets needed for benchmarking.
+         2. Run the setup script to install libraries and datasets needed for benchmarking.
 
-         .. code-block:: shell
+            .. code-block:: shell
 
-            ./pytorch_benchmark_setup.sh
+               ./pytorch_benchmark_setup.sh
 
-         .. container:: model-doc pyt_train_llama-3.1-8b
+            .. container:: model-doc pyt_train_llama-3.1-8b
 
-            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
 
-            .. list-table::
-               :header-rows: 1
+               .. list-table::
+                  :header-rows: 1
 
-               * - Library
-                 - Reference
+                  * - Library
+                    - Reference
 
-               * - ``accelerate``
-                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+                  * - ``accelerate``
+                    - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
 
-               * - ``datasets``
-                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
 
-         .. container:: model-doc pyt_train_llama-3.1-70b
+            .. container:: model-doc pyt_train_llama-3.1-70b
 
-            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
 
-            .. list-table::
-               :header-rows: 1
+               .. list-table::
+                  :header-rows: 1
 
-               * - Library
-                 - Reference
+                  * - Library
+                    - Reference
 
-               * - ``datasets``
-                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
 
-               * - ``torchdata``
-                 - `TorchData <https://pytorch.org/data/beta/index.html>`_
+                  * - ``torchdata``
+                    - `TorchData <https://pytorch.org/data/beta/index.html>`_
 
-               * - ``tomli``
-                 - `Tomli <https://pypi.org/project/tomli/>`_
+                  * - ``tomli``
+                    - `Tomli <https://pypi.org/project/tomli/>`_
 
-               * - ``tiktoken``
-                 - `tiktoken <https://github.com/openai/tiktoken>`_
+                  * - ``tiktoken``
+                    - `tiktoken <https://github.com/openai/tiktoken>`_
 
-               * - ``blobfile``
-                 - `blobfile <https://pypi.org/project/blobfile/>`_
+                  * - ``blobfile``
+                    - `blobfile <https://pypi.org/project/blobfile/>`_
 
-               * - ``tabulate``
-                 - `tabulate <https://pypi.org/project/tabulate/>`_
+                  * - ``tabulate``
+                    - `tabulate <https://pypi.org/project/tabulate/>`_
 
-               * - ``wandb``
-                 - `Weights & Biases <https://github.com/wandb/wandb>`_
+                  * - ``wandb``
+                    - `Weights & Biases <https://github.com/wandb/wandb>`_
 
-               * - ``sentencepiece``
-                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+                  * - ``sentencepiece``
+                    - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
 
-               * - ``tensorboard``
-                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+                  * - ``tensorboard``
+                    - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
 
-         .. container:: model-doc pyt_train_flux
+            .. container:: model-doc pyt_train_flux
 
-            ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
 
-            .. list-table::
-               :header-rows: 1
+               .. list-table::
+                  :header-rows: 1
 
-               * - Library
-                 - Reference
+                  * - Library
+                    - Reference
 
-               * - ``accelerate``
-                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+                  * - ``accelerate``
+                    - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
 
-               * - ``datasets``
-                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
 
-               * - ``sentencepiece``
-                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+                  * - ``sentencepiece``
+                    - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
 
-               * - ``tensorboard``
-                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+                  * - ``tensorboard``
+                    - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
 
-               * - ``csvkit``
-                 - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
+                  * - ``csvkit``
+                    - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
 
-               * - ``deepspeed``
-                 - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
+                  * - ``deepspeed``
+                    - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
 
-               * - ``diffusers``
-                 - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
+                  * - ``diffusers``
+                    - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
 
-               * - ``GitPython``
-                 - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
+                  * - ``GitPython``
+                    - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
 
-               * - ``opencv-python-headless``
-                 - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
+                  * - ``opencv-python-headless``
+                    - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
 
-               * - ``peft``
-                 - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
+                  * - ``peft``
+                    - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
 
-               * - ``protobuf``
-                 - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
+                  * - ``protobuf``
+                    - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
 
-               * - ``pytest``
-                 - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
+                  * - ``pytest``
+                    - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
 
-               * - ``python-dotenv``
-                 - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
+                  * - ``python-dotenv``
+                    - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
 
-               * - ``seaborn``
-                 - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
+                  * - ``seaborn``
+                    - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
 
-               * - ``transformers``
-                 - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
+                  * - ``transformers``
+                    - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
 
-         ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
+            ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
 
-         * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
+            * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
 
    {% for model_group in model_groups %}
       {% for model in model_group.models %}
-         {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
+         {% set training_modes = model.training_modes %}
+         {% set training_mode_descs = {
+            "pretrain": "Benchmark pre-training.",
+            "HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
+         } %}
+         {% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
+         {% if available_modes %}
 
          .. container:: model-doc {{ model.mad_tag }}
 
-            .. rubric:: Pretraining
+            .. rubric:: Pre-training
 
             To start the pre-training benchmark, use the following command with the
             appropriate options. See the following list of options and their descriptions.
 
             .. code-block:: shell
 
-               ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
-
-            .. list-table::
-               :header-rows: 1
-
-               * - Name
-                 - Options
-                 - Description
-
-            {% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
-               * - ``$datatype``
-                 - ``BF16`` or ``FP8``
-                 - Only Llama 3.1 8B supports FP8 precision.
-            {% else %}
-               * - ``$datatype``
-                 - ``BF16``
-                 - Only Llama 3.1 8B supports FP8 precision.
-            {% endif %}
-
-               * - ``$sequence_length``
-                 - Sequence length for the language model.
-                 - Between 2048 and 8192. 8192 by default.
+               ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length
 
             {% if model.mad_tag == "pyt_train_flux" %}
             .. container:: model-doc {{ model.mad_tag }}
 
                .. note::
 
+                  Currently, FLUX models are not supported out-of-the-box on {{ unified_docker.pull_tag }}.
+                  To use FLUX, refer to the previous version of the ``pytorch-training`` Docker: :doc:`previous-versions/pytorch-training-v25.6`
+
                   Occasionally, downloading the Flux dataset might fail. In the event of this
                   error, manually download it from Hugging Face at
                   `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
                   and save it to `/workspace/FluxBenchmark`. This ensures that the test script can access
                   the required dataset.
             {% endif %}
-         {% endif %}
-
-         {% if model_group.tag == "fine-tuning" %}
-         .. container:: model-doc {{ model.mad_tag }}
-
-            .. rubric:: Fine-tuning
-
-            To start the fine-tuning benchmark, use the following command with the
-            appropriate options. See the following list of options and their descriptions.
-
-            .. code-block:: shell
-
-               ./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
 
             .. list-table::
                :header-rows: 1
@@ -383,53 +404,143 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
                  - Options
                  - Description
 
-               * - ``$training_mode``
-                 - ``finetune_fw``
-                 - Full weight fine-tuning (BF16 supported)
-
-               * -
-                 - ``finetune_lora``
-                 - LoRA fine-tuning (BF16 supported)
-
-               * -
-                 - ``finetune_qlora``
-                 - QLoRA fine-tuning (BF16 supported)
-
-               * -
-                 - ``HF_finetune_lora``
-                 - LoRA fine-tuning with Hugging Face PEFT
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}
 
                * - ``$datatype``
-                 - ``BF16``
-                 - All models support BF16.
+                 - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
+                 - Only Llama 3.1 8B supports FP8 precision.
+
+               * - ``$sequence_length``
+                 - Between 2048 and 8192. 8192 by default.
+                 - Sequence length for the language model.
+         {% endif %}
+
+         {% set training_mode_descs = {
+            "finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
+            "finetune_lora": "LoRA fine-tuning (BF16 supported).",
+            "finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
+            "HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
+         } %}
+         {% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
+         {% if available_modes %}
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Fine-tuning
+
+            To start the fine-tuning benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+            See :ref:`supported training modes <amd-pytorch-training-supported-training-modes>`.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t $training_mode \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}
+
+               * - ``$datatype``
+                 - ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
+                 - All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}
 
                * - ``$sequence_length``
                  - Between 2048 and 16384.
                  - Sequence length for the language model.
 
+            {% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
             .. note::
 
-               {{ model.model }} currently supports the following fine-tuning methods:
+               For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
+               use the following torchtune commit for compatibility:
 
-            {% for method in model.training_modes %}
-               * ``{{ method }}``
-            {% endfor %}
-            {% if model.training_modes|length < 4 %}
+               .. code-block:: shell
+
+                  git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
+
+            {% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
+            .. note::
+
+               You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
+               input tensor should be smaller than max_seq_len (4096)``.
+               This error indicates that an input sequence is longer than the model's maximum context window.
+
+               Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
+               tokens in this case). You can resolve this by truncating the input or splitting
+               it into smaller chunks before passing it to the model.
+
+               Note on reproducibility: The results in this guide are based on
+               commit ``b4c98ac`` from the upstream
+               `<https://github.com/pytorch/torchtune>`__ repository. For the
+               latest updates, you can use the main branch.
 
-               The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
-               does not currently provide YAML configuration files for other combinations of
-               model to fine-tuning method
-               However, you can still configure your own YAML files to enable support for
-               fine-tuning methods not listed here by following existing patterns in the
-               ``/workspace/torchtune/recipes/configs`` directory.
             {% endif %}
          {% endif %}
       {% endfor %}
    {% endfor %}
 
-               .. rubric:: Benchmarking examples
+            .. rubric:: Benchmarking examples
 
-               For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+            For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+
+Multi-node training
+-------------------
+
+Pre-training
+~~~~~~~~~~~~
+
+Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch run_slurm_train.sh
+
+Fine-tuning
+~~~~~~~~~~~
+
+Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   huggingface-cli login # Get access to HF Llama model space
+   huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch Torchtune_Multinode.sh
+
+.. note::
+
+   Information regarding benchmark setup:
+
+   * By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
+   * You can adjust the torchtune `YAML configuration file
+     <https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
+     if you're using a different model.
+   * The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
+   * Set the ``mounting_paths`` inside the SLURM script.
+
+Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
 
 Further reading
 ===============

From 94476f34ca445c38417e37dd98215f4def28516a Mon Sep 17 00:00:00 2001
From: Joseph Macaranas <145489236+jayhawk-commits@users.noreply.github.com>
Date: Mon, 8 Sep 2025 11:32:10 -0400
Subject: [PATCH 19/53] [External CI] Add amdgpu deps to rocpydecode pipeline
 (#5267)

---
 .azuredevops/components/rocDecode.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.azuredevops/components/rocDecode.yml b/.azuredevops/components/rocDecode.yml
index f71c3cf48..3b4bc6a71 100644
--- a/.azuredevops/components/rocDecode.yml
+++ b/.azuredevops/components/rocDecode.yml
@@ -8,6 +8,9 @@ parameters:
 - name: checkoutRef
   type: string
   default: ''
+- name: rocPyDecodeRepo
+  type: string
+  default: rocpydecode_repo
 # monorepo related parameters
 - name: sparseCheckoutDir
   type: string
@@ -207,7 +210,7 @@ jobs:
     - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
       - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
         parameters:
-          checkoutRepo: ${{ parameters.checkoutRepo }}
+          checkoutRepo: ${{ parameters.rocPyDecodeRepo }}
           sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
           buildDependsOn: ${{ component.buildDependsOn }}
           downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}

From 4f531836966339d217007429da9a7b0d2ccb8496 Mon Sep 17 00:00:00 2001
From: Peter Park <peter.park@amd.com>
Date: Mon, 8 Sep 2025 21:42:56 -0400
Subject: [PATCH 20/53] docs: Add JAX MaxText benchmark v25.7 (#5182)

* Update previous versions

* Add data file

* fix filename and anchors

* add templates

* update .wordlist.txt

* Update template and data

add missing step

fix fmt

* update template

* fix data

* add jax 0.6.0

* update history

* update quantized training note
---
 .wordlist.txt                                 |   2 +
 .../jax-maxtext-benchmark-models.yaml         |  72 +++
 .../training/benchmark-docker/jax-maxtext.rst | 481 ++++++++++--------
 .../previous-versions/jax-maxtext-history.rst |  13 +-
 .../previous-versions/jax-maxtext-v25.4.rst   |   2 +-
 .../previous-versions/jax-maxtext-v25.5.rst   | 385 ++++++++++++++
 6 files changed, 734 insertions(+), 221 deletions(-)
 create mode 100644 docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
 create mode 100644 docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst

diff --git a/.wordlist.txt b/.wordlist.txt
index 4eb5df599..289fc276e 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -293,6 +293,7 @@ Multicore
 Multithreaded
 MyEnvironment
 MyST
+NANOO
 NBIO
 NBIOs
 NCCL
@@ -742,6 +743,7 @@ logits
 lossy
 macOS
 matchers
+maxtext
 megatron
 microarchitecture
 migraphx
diff --git a/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
new file mode 100644
index 000000000..5ca21898c
--- /dev/null
+++ b/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
@@ -0,0 +1,72 @@
+dockers:
+  - pull_tag: rocm/jax-training:maxtext-v25.7
+    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
+    components:
+      ROCm: 6.4.1
+      JAX: 0.5.0
+      Python: 3.10.12
+      Transformer Engine: 2.1.0+90d703dd
+      hipBLASLt: 1.x.x
+  - pull_tag: rocm/jax-training:maxtext-v25.7-jax060
+    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
+    components:
+      ROCm: 6.4.1
+      JAX: 0.6.0
+      Python: 3.10.12
+      Transformer Engine: 2.1.0+90d703dd
+      hipBLASLt: 1.1.0-499ece1c21
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: jax_maxtext_train_llama-3.3-70b
+        model_repo: Llama-3.3-70B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3.1 8B
+        mad_tag: jax_maxtext_train_llama-3.1-8b
+        model_repo: Llama-3.1-8B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3.1 70B
+        mad_tag: jax_maxtext_train_llama-3.1-70b
+        model_repo: Llama-3.1-70B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3 8B
+        mad_tag: jax_maxtext_train_llama-3-8b
+        multinode_training_script: llama3_8b_multinode.sh
+        doc_options: ["multi-node"]
+      - model: Llama 3 70B
+        mad_tag: jax_maxtext_train_llama-3-70b
+        multinode_training_script: llama3_70b_multinode.sh
+        doc_options: ["multi-node"]
+      - model: Llama 2 7B
+        mad_tag: jax_maxtext_train_llama-2-7b
+        model_repo: Llama-2-7B
+        precision: bf16
+        multinode_training_script: llama2_7b_multinode.sh
+        doc_options: ["single-node", "multi-node"]
+      - model: Llama 2 70B
+        mad_tag: jax_maxtext_train_llama-2-70b
+        model_repo: Llama-2-70B
+        precision: bf16
+        multinode_training_script: llama2_70b_multinode.sh
+        doc_options: ["single-node", "multi-node"]
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V2-Lite (16B)
+        mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
+        model_repo: DeepSeek-V2-lite
+        precision: bf16
+        doc_options: ["single-node"]
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: jax_maxtext_train_mixtral-8x7b
+        model_repo: Mixtral-8x7B
+        precision: bf16
+        doc_options: ["single-node"]
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
index bb364e42a..a85f5af56 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -2,9 +2,9 @@
    :description: How to train a model using JAX MaxText for ROCm.
    :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
 
-**************************************
-Training a model with MaxText for ROCm
-**************************************
+******************************************
+Training a model with JAX MaxText for ROCm
+******************************************
 
 MaxText is a high-performance, open-source framework built on the Google JAX
 machine learning library to train LLMs at scale. The MaxText framework for
@@ -12,70 +12,108 @@ ROCm is an optimized fork of the upstream
 `<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
 on AMD MI300X series accelerators.
 
-The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.5``) image
+The MaxText for ROCm training Docker image
 provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
 including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
 It includes the following software components:
 
-+--------------------------+--------------------------------+
-| Software component       | Version                        |
-+==========================+================================+
-| ROCm                     | 6.3.4                          |
-+--------------------------+--------------------------------+
-| JAX                      | 0.4.35                         |
-+--------------------------+--------------------------------+
-| Python                   | 3.10.12                        |
-+--------------------------+--------------------------------+
-| Transformer Engine       | 1.12.0.dev0+b8b92dc            |
-+--------------------------+--------------------------------+
-| hipBLASLt                | 0.13.0-ae9c477a                |
-+--------------------------+--------------------------------+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
 
-Supported features and models
-=============================
+   {% set dockers = data.dockers %}
+   .. tab-set::
 
-MaxText provides the following key features to train large language models efficiently:
+      {% for docker in dockers %}
+      {% set jax_version = docker.components["JAX"] %}
+
+      .. tab-item:: JAX {{ jax_version }}
+         :sync: {{ docker.pull_tag }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+
+            {% endfor %}
+         {% if jax_version == "0.6.0" %}
+         .. note::
+
+            Shardy is a new config in JAX 0.6.0. You might get related errors if it's
+            not configured correctly. For now you can turn it off by setting
+            ``shardy=False`` during the training run. You can also follow the `migration
+            guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
+            it.
+
+            The provided multi-node training scripts in this documentation are
+            not currently supported with JAX 0.6.0. For multi-node training, use the JAX 0.5.0
+            Docker image.
+         {% endif %}
+
+      {% endfor %}
+
+MaxText on ROCm provides the following key features to train large language models efficiently:
 
 - Transformer Engine (TE)
 
-- Flash Attention (FA) 3
+- Flash Attention (FA) 3 -- with or without sequence input packing
 
 - GEMM tuning
 
 - Multi-node support
 
-.. _amd-maxtext-model-support:
+- NANOO FP8 quantization support
 
-The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
+.. _amd-maxtext-model-support-v257:
 
-* Llama 3.3 70B
+Supported models
+================
 
-* Llama 3.1 8B
+The following models are pre-optimized for performance on AMD Instinct MI300
+series accelerators. Some instructions, commands, and available training
+configurations in this documentation might vary by model -- select one to get
+started.
 
-* Llama 3.1 70B
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
 
-* Llama 3 8B
+   {% set model_groups = data.model_groups %}
+   .. raw:: html
 
-* Llama 3 70B
+         <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+           <div class="row">
+             <div class="col-2 me-2 model-param-head">Model</div>
+             <div class="row col-10">
+      {% for model_group in model_groups %}
+               <div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+             </div>
+           </div>
 
-* Llama 2 7B
-
-* Llama 2 70B
-
-* DeepSeek-V2-Lite
+           <div class="row mt-1">
+             <div class="col-2 me-2 model-param-head">Model variant</div>
+             <div class="row col-10">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+             </div>
+           </div>
+         </div>
 
 .. note::
 
    Some models, such as Llama 3, require an external license agreement through
    a third party (for example, Meta).
 
-Unsupported features
---------------------
-
-Currently, MaxText's default packed input format is not supported. Using this format
-with the current Docker image results in incorrect attention calculations
-across different input sequences. Support for packed input format is planned for a future release.
-
 System validation
 =================
 
@@ -98,14 +136,14 @@ This Docker image is optimized for specific model configurations outlined
 as follows. Performance can vary for other training workloads, as AMD
 doesn’t validate configurations and run conditions outside those described.
 
-.. _amd-maxtext-multi-node-setup:
+.. _amd-maxtext-multi-node-setup-v257:
 
 Multi-node setup
 ----------------
 
 For multi-node environments, ensure you have all the necessary packages for
 your network device, such as, RDMA. If you're not using a multi-node setup
-with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
+with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`.
 
 1. Install the following packages to build and install the RDMA driver.
 
@@ -180,196 +218,203 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
          # If using Mellanox NIC
          export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
 
-.. _amd-maxtext-download-docker:
+.. _amd-maxtext-get-started-v257:
 
-Pull the Docker image
----------------------
+Benchmarking
+============
 
-1. Use the following command to pull the Docker image from Docker Hub.
+Once the setup is complete, choose between two options to reproduce the
+benchmark results:
 
-   .. code-block:: shell
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
 
-      docker pull rocm/jax-training:maxtext-v25.5
+   .. _vllm-benchmark-mad:
 
-2. Use the following command to launch the Docker container. Note that the benchmarking scripts
-   used in the :ref:`following section <amd-maxtext-get-started>` automatically launch the Docker container
-   and execute the benchmark.
+   {% set dockers = data.dockers %}
+   {% set model_groups = data.model_groups %}
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
 
-   .. code-block:: shell
+   .. container:: model-doc {{model.mad_tag}}
 
-      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5
+      .. tab-set::
 
-.. _amd-maxtext-get-started:
+         {% if model.mad_tag and "single-node" in model.doc_options %}
+         .. tab-item:: MAD-integrated benchmarking
 
-Getting started
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. Use this command to run the performance benchmark test on the {{ model.model }} model
+               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/perf.csv``.
+         {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            .. rubric:: Download the Docker image and required scripts
+
+            Run the JAX MaxText benchmark tool independently by starting the
+            Docker container as shown in the following snippet.
+
+            .. tab-set::
+               {% for docker in dockers %}
+               {% set jax_version = docker.components["JAX"] %}
+
+               .. tab-item:: JAX {{ jax_version }}
+                  :sync: {{ docker.pull_tag }}
+
+                  .. code-block:: shell
+
+                     docker pull {{ docker.pull_tag }}
+               {% endfor %}
+
+            {% if model.model_repo and "single-node" in model.doc_options %}
+            .. rubric:: Single node training
+
+            1. Set up environment variables.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN=<Your Hugging Face token>
+                  export HF_HOME=<Location of saved/cached Hugging Face models>
+
+               ``MAD_SECRETS_HFTOKEN`` is your Hugging Face access token to access models, tokenizers, and data.
+               See `User access tokens <https://huggingface.co/docs/hub/en/security-tokens>`__.
+
+               ``HF_HOME`` is where ``huggingface_hub`` will store local data. See `huggingface_hub CLI <https://huggingface.co/docs/huggingface_hub/main/en/guides/cli#huggingface-cli-download>`__.
+               If you already have downloaded or cached Hugging Face artifacts, set this variable to that path.
+               Downloaded files typically get cached to ``~/.cache/huggingface``.
+
+            2. Launch the Docker container.
+
+               .. tab-set::
+                  {% for docker in dockers %}
+                  {% set jax_version = docker.components["JAX"] %}
+
+                  .. tab-item:: JAX {{ jax_version }}
+                     :sync: {{ docker.pull_tag }}
+
+                     .. code-block:: shell
+
+                        docker run -it \
+                            --device=/dev/dri \
+                            --device=/dev/kfd \
+                            --network host \
+                            --ipc host \
+                            --group-add video \
+                            --cap-add=SYS_PTRACE \
+                            --security-opt seccomp=unconfined \
+                            --privileged \
+                            -v $HOME:$HOME \
+                            -v $HOME/.ssh:/root/.ssh \
+                            -v $HF_HOME:/hf_cache \
+                            -e HF_HOME=/hf_cache \
+                            -e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN \
+                            --shm-size 64G \
+                            --name training_env \
+                            {{ docker.pull_tag }}
+                  {% endfor %}
+
+            3. In the Docker container, clone the ROCm MAD repository and navigate to the
+               benchmark scripts directory at ``MAD/scripts/jax-maxtext``.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD/scripts/jax-maxtext
+
+            4. Run the setup scripts to install libraries and datasets needed
+               for benchmarking.
+
+               .. code-block:: shell
+
+                  ./jax-maxtext_benchmark_setup.sh -m {{ model.model_repo }}
+
+            5. To run the training benchmark without quantization, use the following command:
+
+               .. code-block:: shell
+
+                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }}
+
+               For quantized training, use the following command:
+
+               .. code-block:: shell
+
+                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8
+
+               .. important::
+
+                  Quantized training is not supported with the JAX 0.6.0 Docker image; support
+                  will be added in a future release. For quantized training, use the JAX 0.5.0
+                  Docker image: ``rocm/jax-training:maxtext-v25.7``.
+
+            {% endif %}
+            {% if model.multinode_training_script and "multi-node" in model.doc_options %}
+            .. rubric:: Multi-node training
+
+            The following examples use SLURM to run on multiple nodes.
+
+            .. note::
+
+               The following scripts will launch the Docker container and run the
+               benchmark. Run them outside of any Docker container.
+
+            1. Make sure ``$HF_HOME`` is set before running the test. See
+               `ROCm benchmarking <https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__
+               for more details on downloading the Llama models before running the
+               benchmark.
+
+            2. To run multi-node training for {{ model.model }}, 
+               use the
+               `multi-node training script <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/{{ model.multinode_training_script }}>`__
+               under the ``scripts/jax-maxtext/gpu-rocm/`` directory.
+
+            3. Run the multi-node training benchmark script.
+
+               .. code-block:: shell
+
+                  sbatch -N <num_nodes> {{ model.multinode_training_script }}
+
+         {% else %}
+            .. rubric:: Multi-node training
+
+            For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v257`
+            with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+
+Further reading
 ===============
 
-The following examples demonstrate how to get started with single node
-and multi-node training using the benchmarking scripts provided at
-`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
+- See the ROCm/maxtext benchmarking README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__.
 
-.. important::
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
 
-   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
 
-Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
-set correctly and points to your Hugging Face cache directory. Refer to the
-README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
-for more detailed instructions.
-
-Single node training benchmarking examples
-------------------------------------------
-
-* Example 1: Single node training with Llama 2 7B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh
-
-* Example 2: Single node training with Llama 2 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh
-
-* Example 3: Single node training with Llama 3 8B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh
-
-* Example 4: Single node training with Llama 3 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh
-
-* Example 5: Single node training with Llama 3.3 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh
-
-* Example 6: Single node training with DeepSeek V2 16B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh
-
-  .. note::
-
-     The reported TFLOP/s by MaxText for DeepSeek is not accurate. Use
-     the tokens/s as a performance indicator.
-
-Multi-node training benchmarking examples
------------------------------------------
-
-The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
-own cluster setup.
-
-* Example 1: Multi-node training with Llama 2 7B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama2_7b_multinode.sh
-
-* Example 2: Multi-node training with Llama 2 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama2_70b_multinode.sh
-
-* Example 3: Multi-node training with Llama 3 8B model
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama3_8b_multinode.sh
-
-* Example 4: Multi-node training with Llama 3 70B model
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama3_70b_multinode.sh
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
 
 Previous versions
 =================
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
index b67d1ac3a..e4d039356 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
@@ -17,12 +17,21 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <http
      - Components
      - Resources
 
-   * - 25.5 (latest)
+   * - 25.7 (latest)
+     - 
+       * ROCm 6.4.1
+       * JAX 0.6.0, 0.5.0
+     - 
+       * :doc:`Documentation <../jax-maxtext>`
+       * `Docker Hub (JAX 0.6.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7-jax060/images/sha256-7352212ae033a76dca2b9dceffc23c1b5f1a61a7a560082cf747a9bf1acfc9ce>`__
+       * `Docker Hub (JAX 0.5.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025>`__
+
+   * - 25.5
      - 
        * ROCm 6.3.4
        * JAX 0.4.35
      - 
-       * :doc:`Documentation <../jax-maxtext>`
+       * :doc:`Documentation <jax-maxtext-v25.5>`
        * `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`__
 
    * - 25.4
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
index 03836c9fc..3fe728c35 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
@@ -51,7 +51,7 @@ MaxText provides the following key features to train large language models effic
 
 - Multi-node support
 
-.. _amd-maxtext-model-support:
+.. _amd-maxtext-model-support-v254:
 
 The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
 
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst
new file mode 100644
index 000000000..d5051d28c
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst
@@ -0,0 +1,385 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using JAX MaxText for ROCm.
+   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with MaxText for ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm JAX MaxText
+   training performance documentation. See :doc:`../jax-maxtext` for the latest version.
+
+MaxText is a high-performance, open-source framework built on the Google JAX
+machine learning library to train LLMs at scale. The MaxText framework for
+ROCm is an optimized fork of the upstream
+`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
+on AMD MI300X series accelerators.
+
+The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.5``) image
+provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
+including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
+It includes the following software components:
+
++--------------------------+--------------------------------+
+| Software component       | Version                        |
++==========================+================================+
+| ROCm                     | 6.3.4                          |
++--------------------------+--------------------------------+
+| JAX                      | 0.4.35                         |
++--------------------------+--------------------------------+
+| Python                   | 3.10.12                        |
++--------------------------+--------------------------------+
+| Transformer Engine       | 1.12.0.dev0+b8b92dc            |
++--------------------------+--------------------------------+
+| hipBLASLt                | 0.13.0-ae9c477a                |
++--------------------------+--------------------------------+
+
+Supported features and models
+=============================
+
+MaxText provides the following key features to train large language models efficiently:
+
+- Transformer Engine (TE)
+
+- Flash Attention (FA) 3
+
+- GEMM tuning
+
+- Multi-node support
+
+.. _amd-maxtext-model-support-v255:
+
+The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
+
+* Llama 3.3 70B
+
+* Llama 3.1 8B
+
+* Llama 3.1 70B
+
+* Llama 3 8B
+
+* Llama 3 70B
+
+* Llama 2 7B
+
+* Llama 2 70B
+
+* DeepSeek-V2-Lite
+
+.. note::
+
+   Some models, such as Llama 3, require an external license agreement through
+   a third party (for example, Meta).
+
+Unsupported features
+--------------------
+
+Currently, MaxText's default packed input format is not supported. Using this format
+with the current Docker image results in incorrect attention calculations
+across different input sequences. Support for packed input format is planned for a future release.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+Environment setup
+=================
+
+This Docker image is optimized for specific model configurations outlined
+as follows. Performance can vary for other training workloads, as AMD
+doesn’t validate configurations and run conditions outside those described.
+
+.. _amd-maxtext-multi-node-setup-v255:
+
+Multi-node setup
+----------------
+
+For multi-node environments, ensure you have all the necessary packages for
+your network device, such as RDMA. If you're not using a multi-node setup
+with RDMA, skip ahead to :ref:`amd-maxtext-download-docker-v255`.
+
+1. Install the following packages to build and install the RDMA driver.
+
+   .. code-block:: shell
+
+      sudo apt install iproute2 -y
+      sudo apt install -y linux-headers-"$(uname -r)" libelf-dev
+      sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev
+
+   Refer to your NIC manufacturer's documentation for further steps on
+   compiling and installing the RoCE driver. For example, for Broadcom,
+   see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
+   in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.
+
+2. Set the following environment variables.
+
+   a. Master address
+
+      Change ``localhost`` to the master node's resolvable hostname or IP address:
+
+      .. code-block:: bash
+
+         export MASTER_ADDR="${MASTER_ADDR:-localhost}"
+
+   b. Number of nodes
+
+      Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):
+
+      .. code-block:: bash
+
+         export NNODES="${NNODES:-1}"
+
+   c. Node ranks
+
+      Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
+      Node ranks should be unique across all nodes in the cluster.
+
+      .. code-block:: bash
+
+         export NODE_RANK="${NODE_RANK:-0}"
+
+   d. Network interface
+
+      Update the network interface in the script to match your system's network interface. To
+      find your network interface, run the following (outside of any Docker container):
+
+      .. code-block:: bash
+
+         ip a
+
+      Look for an active interface with an IP address in the same subnet as
+      your other nodes. Then, update the following variable in the script, for
+      example:
+
+      .. code-block:: bash
+
+         export NCCL_SOCKET_IFNAME=ens50f0np0
+
+      This variable specifies which network interface to use for inter-node communication.
+      Setting this variable to the incorrect interface can result in communication failures
+      or significantly reduced performance.
+
+   e. RDMA interface
+
+      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v255>` are installed on all nodes.
+      Then, set the RDMA interfaces to use for communication.
+
+      .. code-block:: bash
+
+         # If using Broadcom NIC
+         export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+         # If using Mellanox NIC
+         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
+
+.. _amd-maxtext-download-docker-v255:
+
+Pull the Docker image
+---------------------
+
+1. Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull rocm/jax-training:maxtext-v25.5
+
+2. Use the following command to launch the Docker container. Note that the benchmarking scripts
+   used in the :ref:`following section <amd-maxtext-get-started-v255>` automatically launch the Docker container
+   and execute the benchmark.
+
+   .. code-block:: shell
+
+      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5
+
+.. _amd-maxtext-get-started-v255:
+
+Getting started
+===============
+
+The following examples demonstrate how to get started with single node
+and multi-node training using the benchmarking scripts provided at
+`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
+
+.. important::
+
+   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
+
+Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
+set correctly and points to your Hugging Face cache directory. Refer to the
+README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
+for more detailed instructions.
+
+Single node training benchmarking examples
+------------------------------------------
+
+* Example 1: Single node training with Llama 2 7B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh
+
+* Example 2: Single node training with Llama 2 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh
+
+* Example 3: Single node training with Llama 3 8B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh
+
+* Example 4: Single node training with Llama 3 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh
+
+* Example 5: Single node training with Llama 3.3 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh
+
+* Example 6: Single node training with DeepSeek V2 16B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh
+
+  .. note::
+
+     The reported TFLOP/s by MaxText for DeepSeek is not accurate. Use
+     the tokens/s as a performance indicator.
+
+Multi-node training benchmarking examples
+-----------------------------------------
+
+The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
+own cluster setup.
+
+* Example 1: Multi-node training with Llama 2 7B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama2_7b_multinode.sh
+
+* Example 2: Multi-node training with Llama 2 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama2_70b_multinode.sh
+
+* Example 3: Multi-node training with Llama 3 8B model
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama3_8b_multinode.sh
+
+* Example 4: Multi-node training with Llama 3 70B model
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama3_70b_multinode.sh
+
+Previous versions
+=================
+
+See :doc:`jax-maxtext-history` to find documentation for previous releases
+of the ``ROCm/jax-training`` Docker image.

From db43d18c3725ba53c00544971139ba9b743f1536 Mon Sep 17 00:00:00 2001
From: anisha-amd <anisha.sankar@amd.com>
Date: Tue, 9 Sep 2025 11:02:30 -0400
Subject: [PATCH 21/53] Docs: frameworks compatibility- ray and llama.cpp
 (#5273)

---
 .wordlist.txt                                 |   1 +
 .../compatibility-matrix-historical-6.0.csv   |   2 +
 docs/compatibility/compatibility-matrix.rst   |   2 +
 .../llama-cpp-compatibility.rst               | 151 ++++++++++++++++++
 .../ml-compatibility/ray-compatibility.rst    | 105 ++++++++++++
 docs/conf.py                                  |   2 +
 docs/how-to/deep-learning-rocm.rst            |  22 +++
 docs/sphinx/_toc.yml.in                       |  12 +-
 8 files changed, 293 insertions(+), 4 deletions(-)
 create mode 100644 docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
 create mode 100644 docs/compatibility/ml-compatibility/ray-compatibility.rst

diff --git a/.wordlist.txt b/.wordlist.txt
index 289fc276e..5370f4752 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -501,6 +501,7 @@ Unhandled
 VALU
 VBIOS
 VCN
+verl's
 VGPR
 VGPRs
 VM
diff --git a/docs/compatibility/compatibility-matrix-historical-6.0.csv b/docs/compatibility/compatibility-matrix-historical-6.0.csv
index b8f7b6ba2..54f5ceb50 100644
--- a/docs/compatibility/compatibility-matrix-historical-6.0.csv
+++ b/docs/compatibility/compatibility-matrix-historical-6.0.csv
@@ -35,6 +35,8 @@ ROCm Version,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6
       :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
       :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
       :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Ray <../compatibility/ml-compatibility/ray-compatibility>` [#ray_compat]_,N/A,N/A,2.48.0.post0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`llama.cpp <../compatibility/ml-compatibility/llama-cpp-compatibility>` [#llama-cpp_compat]_,N/A,N/A,N/A,b5997,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
       `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
 ,,,,,,,,,,,,,,,,,,
       ,,,,,,,,,,,,,,,,,,
diff --git a/docs/compatibility/compatibility-matrix.rst b/docs/compatibility/compatibility-matrix.rst
index 797e2894e..fb1ffad43 100644
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -246,6 +246,8 @@ Expand for full historical view of:
    .. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
    .. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
    .. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
+   .. [#ray_compat] Ray is only supported on ROCm 6.4.1.
+   .. [#llama-cpp_compat] llama.cpp is only supported on ROCm 6.4.0.
    .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
    .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
    
diff --git a/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
new file mode 100644
index 000000000..fd1356d32
--- /dev/null
+++ b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
@@ -0,0 +1,151 @@
+:orphan:
+
+.. meta::
+    :description: llama.cpp deep learning framework compatibility
+    :keywords: GPU, GGML, llama.cpp compatibility
+
+.. version-set:: rocm_version latest
+
+********************************************************************************
+llama.cpp compatibility
+********************************************************************************
+
+`llama.cpp <https://github.com/ggml-org/llama.cpp>`__ is an open-source framework 
+for Large Language Model (LLM) inference that runs on both central processing units 
+(CPUs) and graphics processing units (GPUs). It is written in plain C/C++, providing 
+a simple, dependency-free setup. 
+
+The framework supports multiple quantization options, from 1.5-bit to 8-bit integers, 
+to speed up inference and reduce memory usage. Originally built as a CPU-first library, 
+llama.cpp is easy to integrate with other programming environments and is widely 
+adopted across diverse platforms, including consumer devices. 
+
+ROCm support for llama.cpp is upstreamed, and you can build the official source code
+with ROCm support:
+
+- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp 
+  <https://github.com/ROCm/llama.cpp>`_ repository.
+
+- Due to independent compatibility considerations, this location differs from the 
+  `https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.
+
+- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`, 
+  which includes ROCm, llama.cpp, and all required dependencies.
+
+  - See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>` 
+    to install and get started.
+
+  - See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__ 
+    in the upstream llama.cpp documentation.
+
+.. note::
+
+  llama.cpp is supported on ROCm 6.4.0.
+
+Supported devices
+================================================================================
+
+**Officially Supported**: AMD Instinct™ MI300X, MI210
+
+
+Use cases and recommendations
+================================================================================
+
+llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:
+
+- Plain C/C++ implementation with no external dependencies
+- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
+- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
+- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)
+
+llama.cpp is also used in a range of real-world applications, including:
+
+- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
+  A simple maze game where AI-controlled agents attempt to trick the player.
+- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
+  A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
+- Various other AI applications use llama.cpp as their inference engine;  
+  for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
+
+Refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_,
+where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
+
+.. _llama-cpp-docker-compat:
+
+Docker image compatibility
+================================================================================
+
+.. |docker-icon| raw:: html
+
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp>`__
+with ROCm backends on Docker Hub. The following Docker image tags and associated
+inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
+Click |docker-icon| to view the image on Docker Hub.
+
+.. important::
+
+   Tag endings of ``_full``, ``_server``, and ``_light`` serve different purposes for entrypoints as follows:
+
+   - Full: This image includes both the main executable file and the tools to convert ``LLaMA`` models into ``ggml`` format and quantize them to 4 bits.
+   - Server: This image only includes the server executable file.
+   - Light: This image only includes the main executable file.
+
+.. list-table::
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Full Docker
+      - Server Docker
+      - Light Docker
+      - llama.cpp
+      - Ubuntu
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_full/images/sha256-f78f6c81ab2f8e957469415fe2370a1334fe969c381d1fe46050c85effaee9d5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+      - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_server/images/sha256-275ad9e18f292c26a00a2de840c37917e98737a88a3520bdc35fd3fc5c9a6a9b"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+      - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_light/images/sha256-cc324e6faeedf0e400011f07b49d2dc41a16bae257b2b7befa0f4e2e97231320"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+      - `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
+      - 24.04
+
+Key ROCm libraries for llama.cpp
+================================================================================
+
+llama.cpp functionality on ROCm is determined by its underlying library
+dependencies. These ROCm components affect the capabilities, performance, and
+feature set available to developers.
+
+.. list-table::
+    :header-rows: 1
+
+    * - ROCm library
+      - Version
+      - Purpose
+      - Usage
+    * - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
+      - :version-ref:`hipBLAS rocm_version`
+      - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
+        matrix and vector operations.
+      - Supports operations such as matrix multiplication, matrix-vector
+        products, and tensor contractions. Utilized in both dense and batched
+        linear algebra operations.
+    * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
+      - :version-ref:`hipBLASLt rocm_version`
+      - hipBLASLt is an extension of the hipBLAS library, providing additional
+        features like epilogues fused into the matrix multiplication kernel or
+        use of integer tensor cores.
+      - By setting the flag ``ROCBLAS_USE_HIPBLASLT``, you can dispatch hipBLASLt
+        kernels where possible.
+    * - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
+      - :version-ref:`rocWMMA rocm_version`
+      - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
+        multiplication (GEMM) and accumulation operations with mixed precision
+        support.
+      - Can be used to enhance flash attention performance on AMD GPUs by enabling
+        the ``GGML_HIP_ROCWMMA_FATTN`` build flag at compile time.
\ No newline at end of file
diff --git a/docs/compatibility/ml-compatibility/ray-compatibility.rst b/docs/compatibility/ml-compatibility/ray-compatibility.rst
new file mode 100644
index 000000000..c5a2ed39f
--- /dev/null
+++ b/docs/compatibility/ml-compatibility/ray-compatibility.rst
@@ -0,0 +1,105 @@
+:orphan:
+
+.. meta::
+    :description: Ray deep learning framework compatibility
+    :keywords: GPU, Ray compatibility
+
+.. version-set:: rocm_version latest
+
+*******************************************************************************
+Ray compatibility
+*******************************************************************************
+
+Ray is a unified framework for scaling AI and Python applications from your laptop 
+to a full cluster, without changing your code. Ray consists of `a core distributed 
+runtime <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`_ and a set of
+`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`_ for 
+simplifying machine learning computations.
+
+Ray is a general-purpose framework that runs many types of workloads efficiently. 
+Any Python application can be scaled with Ray, without extra infrastructure.
+
+ROCm support for Ray is upstreamed, and you can build the official source code
+with ROCm support: 
+
+- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray 
+  <https://github.com/ROCm/ray>`_ repository.
+
+- Due to independent compatibility considerations, this location differs from the 
+  `https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.
+
+- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>`,
+  which includes ROCm, Ray, and all required dependencies.
+
+  - See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>` 
+    for instructions to get started.
+
+  - See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_ 
+    in the upstream Ray documentation.
+
+  - The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__ 
+    corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
+
+.. note::
+
+  Ray is supported on ROCm 6.4.1.
+
+Supported devices
+================================================================================
+
+**Officially Supported**: AMD Instinct™ MI300X, MI210
+
+
+Use cases and recommendations
+================================================================================
+
+* The `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm 
+  Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__  
+  blog provides an overview of Volcano Engine Reinforcement Learning (verl) 
+  for large language models (LLMs) and discusses its benefits in large-scale 
+  reinforcement learning from human feedback (RLHF). It uses Ray as part of a 
+  hybrid orchestration engine to schedule and coordinate training and inference 
+  tasks in parallel, enabling optimized resource utilization and potential overlap 
+  between these phases. This dynamic resource allocation strategy significantly 
+  improves overall system efficiency. The blog presents verl’s performance results, 
+  focusing on throughput and convergence accuracy achieved on AMD Instinct™ MI300X 
+  GPUs. Follow this guide to get started with verl on AMD Instinct GPUs and 
+  accelerate your RLHF training with ROCm-optimized performance.
+
+For more use cases and recommendations, see the AMD GPU tabs in the `Accelerator Support 
+topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accelerator-support>`_ 
+of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_, 
+where you can search for Ray examples and best practices to optimize your workloads on AMD GPUs.
+
+.. _ray-docker-compat:
+
+Docker image compatibility
+================================================================================
+
+.. |docker-icon| raw:: html
+
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
+with ROCm backends on Docker Hub. The following Docker image tags and
+associated inventories represent the latest Ray version from the official Docker Hub and are validated for
+`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
+icon to view the image on Docker Hub.
+
+.. list-table::
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Docker image
+      - Ray
+      - PyTorch
+      - Ubuntu
+      - Python
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
+      - `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
+      - 2.6.0+git684f6f2
+      - 24.04
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
diff --git a/docs/conf.py b/docs/conf.py
index 6e7fa5e61..f852b6697 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -108,6 +108,8 @@ article_pages = [
     {"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
     {"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
     {"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
+    {"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
+    {"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
     {"file": "how-to/deep-learning-rocm", "os": ["linux"]},
 
     {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
diff --git a/docs/how-to/deep-learning-rocm.rst b/docs/how-to/deep-learning-rocm.rst
index fb1d55a3c..accb2e546 100644
--- a/docs/how-to/deep-learning-rocm.rst
+++ b/docs/how-to/deep-learning-rocm.rst
@@ -110,6 +110,28 @@ The table below summarizes information about ROCm-enabled deep learning framewor
 
           <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
 
+    * - `Ray <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/ray-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#using-a-prebuilt-docker-image-with-ray-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#install-ray-on-bare-metal-or-a-custom-container>`__
+        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#build-your-own-docker-image>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/ray"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `llama.cpp <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/llama-cpp-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#use-a-prebuilt-docker-image-with-llama-cpp-pre-installed>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>
+
 Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
 through the following guides.
 
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index 1bb9177f0..732aab15e 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -32,19 +32,23 @@ subtrees:
       - file: compatibility/ml-compatibility/pytorch-compatibility.rst
         title: PyTorch compatibility
       - file: compatibility/ml-compatibility/tensorflow-compatibility.rst
-        title: TensorFlow compatibility  
+        title: TensorFlow compatibility
       - file: compatibility/ml-compatibility/jax-compatibility.rst
         title: JAX compatibility
       - file: compatibility/ml-compatibility/verl-compatibility.rst
-        title: verl compatibility  
+        title: verl compatibility
       - file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
         title: Stanford Megatron-LM compatibility
       - file: compatibility/ml-compatibility/dgl-compatibility.rst
-        title: DGL compatibility  
+        title: DGL compatibility
       - file: compatibility/ml-compatibility/megablocks-compatibility.rst
         title: Megablocks compatibility
       - file: compatibility/ml-compatibility/taichi-compatibility.rst
-        title: Taichi compatibility 
+        title: Taichi compatibility
+      - file: compatibility/ml-compatibility/ray-compatibility.rst
+        title: Ray compatibility
+      - file: compatibility/ml-compatibility/llama-cpp-compatibility.rst
+        title: llama.cpp compatibility
   - file: how-to/build-rocm.rst
     title: Build ROCm from source
 

From f25e27acf0bb312001b1f157520c00a10a59b75c Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Tue, 9 Sep 2025 12:22:04 -0400
Subject: [PATCH 22/53] Update roctracer pipeline ID and branch

---
 .azuredevops/templates/steps/dependencies-rocm.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.azuredevops/templates/steps/dependencies-rocm.yml b/.azuredevops/templates/steps/dependencies-rocm.yml
index 86d1b58e9..f62b973df 100644
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -251,8 +251,8 @@ parameters:
       developBranch: develop
       hasGpuTarget: true
     roctracer:
-      pipelineId: 141
-      developBranch: amd-staging
+      pipelineId: 331
+      developBranch: develop
       hasGpuTarget: true
     rocWMMA:
       pipelineId: 109

From 985786e98d68374169391e93a23626c2afcf2a07 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Tue, 9 Sep 2025 15:22:50 -0400
Subject: [PATCH 23/53] Add sqlalchemy to dependencies in rocprofiler-compute

---
 .azuredevops/components/rocprofiler-compute.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.azuredevops/components/rocprofiler-compute.yml b/.azuredevops/components/rocprofiler-compute.yml
index d15414469..bccb51f67 100644
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -55,6 +55,7 @@ parameters:
     - pymongo
     - pyyaml
     - setuptools
+    - sqlalchemy
     - tabulate
     - textual
     - textual_plotext

From 3c37ae88f077849bc3d16e55c15250cde6516b82 Mon Sep 17 00:00:00 2001
From: Ibrahim Wani <113864060+ibrahimw1@users.noreply.github.com>
Date: Tue, 9 Sep 2025 15:13:54 -0600
Subject: [PATCH 24/53] Add origami CI pipelines (#5256)

* Add origami yaml pipeline.

* Unindent lines.

* Add cmake dependency step to origami yml.

* Add pybind dep

* Fix pipeline failures.

* Quick fix

* Fix pybind11 dep for almalinux

* Fix pybind11 dep for almalinux again

* Test

* [Ex CI] don't create symlink if more than one sparse checkout dir

* hipBLASLt multi sparse

* Replace pybind with nanobind.

* Quick fix

* Testing nanobind install in pipelines

* Run origami binding tests

* Change build path for tests

* Change build path for tests again

* Add missing dep for CI

* Add archs to buildJobs

* Fix CI error.

* Test

* Test job target

* Adding job target to hipblaslt dependant builds

* Check devices on machine

* Add gpu to pipeline

* Add more gpu targets

* test

* Add test job to origami

* Update test jobs

* Finding test dir

* Fix sparse checkout

* Find build dir

* Try to find build dir

* Clean up

* Test

* Change test dir

* Build origami in test job

* Try removing job.target from params

* Package bindings in build artifacts

* Download build as artifact.

* Comment out block

* Fix checkout in test job

* Test1

* Echo to list dir

* Sparse checkout origami/python

* Download python bindings as artifact

* Try ctest instead of running test files directly

* Only download artifacts for ubuntu

* Add missing cd

* Run individual tests not ctest.

* Fix hipblaslt build failures

* Resolve more ci failures in hipblaslt

* Add old changes back in

* Fix hipblaslt ci errors

* Clean up

* Add nanobind to array

* Add nanobind to array correctly

* Remove nanobind install script

* Quick fix

* Add pip module installs to test job

---------

Co-authored-by: Daniel Su <danielsu@amd.com>
---
 .azuredevops/components/hipBLASLt.yml |   4 +-
 .azuredevops/components/origami.yml   | 236 ++++++++++++++++++++++++++
 2 files changed, 239 insertions(+), 1 deletion(-)
 create mode 100644 .azuredevops/components/origami.yml

diff --git a/.azuredevops/components/hipBLASLt.yml b/.azuredevops/components/hipBLASLt.yml
index b2633e84d..6364380a5 100644
--- a/.azuredevops/components/hipBLASLt.yml
+++ b/.azuredevops/components/hipBLASLt.yml
@@ -178,7 +178,7 @@ jobs:
           mkdir -p $(Agent.BuildDirectory)/temp-deps
           cd $(Agent.BuildDirectory)/temp-deps
           # position-independent LAPACK is required for almalinux8 builds
-          cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
+          cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/sparse/projects/hipblaslt/deps
           make -j
           sudo make install
     - script: |
@@ -197,6 +197,8 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
         os: ${{ job.os }}
+        cmakeSourceDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt
+        cmakeBuildDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt/build
         extraBuildFlags: >-
           -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
           -DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
diff --git a/.azuredevops/components/origami.yml b/.azuredevops/components/origami.yml
new file mode 100644
index 000000000..b55cd67aa
--- /dev/null
+++ b/.azuredevops/components/origami.yml
@@ -0,0 +1,236 @@
+parameters:  # Template parameters for the origami component pipeline.
+- name: componentName  # used in artifact names, the manifest/upload steps, and the DISABLED_*_TESTS check
+  type: string
+  default: origami
+- name: checkoutRepo  # forwarded to steps/checkout.yml
+  type: string
+  default: 'self'
+- name: checkoutRef  # forwarded to steps/dependencies-rocm.yml
+  type: string
+  default: ''
+# Monorepo-related parameters.
+- name: sparseCheckoutDir  # forwarded to steps/checkout.yml and steps/manifest.yml
+  type: string
+  default: ''
+- name: triggerDownstreamJobs  # when true, the downstreamComponentMatrix templates are instantiated
+  type: boolean
+  default: false
+- name: downstreamAggregateNames  # only forwarded to dependencies-rocm.yml when triggerDownstreamJobs is true
+  type: string
+  default: ''
+- name: buildDependsOn  # job-name prefixes; the build job depends on <prefix>_<os> for each entry
+  type: object
+  default: null
+- name: unifiedBuild  # test jobs are only generated when this is false
+  type: boolean
+  default: false
+# Set to true when doing a full build of the ROCm stack
+# and dependencies are pulled from the same pipeline.
+- name: aggregatePipeline  # also part of the test-job condition: tests run only when false
+  type: boolean
+  default: false
+- name: aptPackages  # passed to steps/dependencies-other.yml and steps/docker-container.yml
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+    - wget
+    - python3
+    - python3-dev
+    - python3-pip
+- name: pipModules  # passed alongside aptPackages to the same two templates
+  type: object
+  default:
+    - nanobind>=2.0.0
+- name: rocmDependencies  # dependencyList for the build job
+  type: object
+  default:
+    - clr
+    - llvm-project
+    - rocm-cmake
+    - rocminfo
+    - ROCR-Runtime
+    - rocprofiler-register
+- name: rocmTestDependencies  # dependencyList for the test jobs
+  type: object
+  default:
+    - clr
+    - llvm-project
+    - rocm-cmake
+    - rocminfo
+    - ROCR-Runtime
+    - rocprofiler-register
+
+- name: jobMatrix  # one build job per buildJobs entry, one test job per testJobs entry
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt }
+      - { os: almalinux8, packageManager: dnf }
+    testJobs:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+- name: downstreamComponentMatrix  # each entry selects /.azuredevops/components/<name>.yml
+  type: object
+  default:
+    - hipBLASLt:
+      name: hipBLASLt
+      sparseCheckoutDir: projects/hipblaslt
+      skipUnifiedBuild: 'false'  # 'true' would skip this component when unifiedBuild is set
+      buildDependsOn:
+        - origami_build
+
+jobs:  # the first entry generates one origami_build_<os> job per jobMatrix.buildJobs item
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: origami_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:  # each buildDependsOn entry is suffixed with _<os>
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    - name: ROCM_PATH
+      value: $(Agent.BuildDirectory)/rocm
+    pool:
+      vmImage: ${{ variables.BASE_BUILD_POOL }}
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:  # almalinux8 builds run inside the manylinux228 image
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:  # install deps, check out sources, fetch ROCm deps, build, then publish artifacts
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        pipModules: ${{ parameters.pipModules }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+      parameters:
+        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmDependencies }}
+        os: ${{ job.os }}
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        extraBuildFlags: >-
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+          -DORIGAMI_BUILD_SHARED_LIBS=ON
+          -DORIGAMI_ENABLE_PYTHON=ON
+          -DORIGAMI_BUILD_TESTING=ON
+          -GNinja
+    - ${{ if ne(job.os, 'almalinux8') }}:
+      - task: PublishPipelineArtifact@1
+        displayName: 'Publish Build Directory Artifact'
+        inputs:  # downloaded again by the test job under the same artifact name
+          targetPath: '$(Agent.BuildDirectory)/s/build'
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
+          publishLocation: 'pipeline'
+      - task: PublishPipelineArtifact@1
+        displayName: 'Publish Python Source Artifact'
+        inputs:  # the test job runs the test scripts from this directory
+          targetPath: '$(Agent.BuildDirectory)/s/python'
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
+          publishLocation: 'pipeline'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
+        componentName: ${{ parameters.componentName }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: origami_test_${{ job.os }}_${{ job.target }}
+      timeoutInMinutes: 120
+      dependsOn: origami_build_${{ job.os }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:  # check out, install deps, fetch build outputs and ROCm deps, then run the Python binding tests
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          packageManager: ${{ job.packageManager }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
+          os: ${{ job.os }}
+      - task: DownloadPipelineArtifact@2
+        displayName: 'Download Build Directory Artifact'
+        inputs:  # artifact published by origami_build_<os>
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
+          path: '$(Agent.BuildDirectory)/s/build'
+      - task: DownloadPipelineArtifact@2
+        displayName: 'Download Python Source Artifact'
+        inputs:  # artifact published by origami_build_<os>
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
+          path: '$(Agent.BuildDirectory)/s/python'
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          os: ${{ job.os }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - script: |
+          export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH
+          rc=0  # remember failures so both test files always run and either one fails the step
+          echo "--- Running origami_test.py ---"
+          python3 $(Agent.BuildDirectory)/s/python/origami_test.py || rc=1
+          echo "--- Running origami_grid_test.py ---"
+          python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py || rc=1
+          exit "$rc"
+        displayName: 'Run Python Binding Tests'
+        condition: succeeded()
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+
+- ${{ if parameters.triggerDownstreamJobs }}:
+  - ${{ each component in parameters.downstreamComponentMatrix }}:
+    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
+      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+        parameters:  # instantiates the downstream component's pipeline with this run's checkout settings
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+          buildDependsOn: ${{ component.buildDependsOn }}
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
+          triggerDownstreamJobs: true  # always passed as true to the downstream template
+          unifiedBuild: ${{ parameters.unifiedBuild }}

From 05a66f75fea71fe19ba29f694c7c22854187e334 Mon Sep 17 00:00:00 2001
From: Peter Park <peter.park@amd.com>
Date: Tue, 9 Sep 2025 17:41:11 -0400
Subject: [PATCH 25/53] add qwen3 30b a3b to vllm-benchmark-models (#5280)

---
 .../how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
index 714534ef1..a522e61a6 100644
--- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
@@ -78,7 +78,11 @@ vllm_benchmark:
         model_repo: Qwen/QwQ-32B
         url: https://huggingface.co/Qwen/QwQ-32B
         precision: float16
-        tunableop: true
+      - model: Qwen3 30B A3B
+        mad_tag: pyt_vllm_qwen3-30b-a3b
+        model_repo: Qwen/Qwen3-30B-A3B
+        url: https://huggingface.co/Qwen/Qwen3-30B-A3B
+        precision: float16
     - group: Microsoft Phi
       tag: phi
       models:

From 68f505e375e9d0a7500aad927bcd77c6aea1b972 Mon Sep 17 00:00:00 2001
From: Pratik Basyal <prbasyal@amd.com>
Date: Wed, 10 Sep 2025 10:07:55 -0400
Subject: [PATCH 26/53] Taichi removed (#5283)

---
 RELEASE.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/RELEASE.md b/RELEASE.md
index be1527030..327a74c0a 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -59,7 +59,6 @@ ROCm documentation continues to be updated to provide clearer and more comprehen
 
 * ROCm provides a comprehensive ecosystem for deep learning development. For more details, see [Deep learning frameworks for ROCm](https://rocm.docs.amd.com/en/docs-6.4.3/how-to/deep-learning-rocm.html). AMD ROCm adds support for the following deep learning frameworks:
 
-    * Taichi is an open-source, imperative, and parallel programming language designed for high-performance numerical computation. Embedded in Python, it leverages just-in-time (JIT) compilation frameworks such as LLVM to accelerate compute-intensive Python code by compiling it to native GPU or CPU instructions. It is currently supported on ROCm 6.3.2. For more information, see [Taichi compatibility](https://rocm.docs.amd.com/en/docs-6.4.3/compatibility/ml-compatibility/taichi-compatibility.html).
     * Megablocks is a light-weight library for mixture-of-experts (MoE) training. The core of the system is efficient "dropless-MoE" and standard MoE layers. Megablocks is integrated with Megatron-LM, where data and pipeline parallel training of MoEs is supported. It is currently supported on ROCm 6.3.0. For more information, see [Megablocks compatibility](https://rocm.docs.amd.com/en/docs-6.4.3/compatibility/ml-compatibility/megablocks-compatibility.html).
 
 * The [Data types and precision support](https://rocm.docs.amd.com/en/latest/reference/precision-support.html) topic now includes new hardware and library support information.

From 3b5019e03f0bb35d41061d2061df0307e4107b2a Mon Sep 17 00:00:00 2001
From: Pratik Basyal <prbasyal@amd.com>
Date: Wed, 10 Sep 2025 10:53:25 -0400
Subject: [PATCH 27/53] Minor correction (#5285)

---
 RELEASE.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RELEASE.md b/RELEASE.md
index 327a74c0a..9d8835de8 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -57,7 +57,7 @@ ROCm documentation continues to be updated to provide clearer and more comprehen
  
     For more information about the changes, see [Changelog for the AI Developer Hub](https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/changelog.html).
 
-* ROCm provides a comprehensive ecosystem for deep learning development. For more details, see [Deep learning frameworks for ROCm](https://rocm.docs.amd.com/en/docs-6.4.3/how-to/deep-learning-rocm.html). AMD ROCm adds support for the following deep learning frameworks:
+* ROCm provides a comprehensive ecosystem for deep learning development. For more details, see [Deep learning frameworks for ROCm](https://rocm.docs.amd.com/en/docs-6.4.3/how-to/deep-learning-rocm.html). AMD ROCm adds support for the following deep learning framework:
 
     * Megablocks is a light-weight library for mixture-of-experts (MoE) training. The core of the system is efficient "dropless-MoE" and standard MoE layers. Megablocks is integrated with Megatron-LM, where data and pipeline parallel training of MoEs is supported. It is currently supported on ROCm 6.3.0. For more information, see [Megablocks compatibility](https://rocm.docs.amd.com/en/docs-6.4.3/compatibility/ml-compatibility/megablocks-compatibility.html).
 

From daa0184d2e768a5e9e618b6e9ff1cc82b544e7ad Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Tue, 9 Sep 2025 16:21:53 -0400
Subject: [PATCH 28/53] [Ex CI] enable rocm-core monorepo

---
 .azuredevops/components/rocm-core.yml | 28 +++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/.azuredevops/components/rocm-core.yml b/.azuredevops/components/rocm-core.yml
index f36252320..714518781 100644
--- a/.azuredevops/components/rocm-core.yml
+++ b/.azuredevops/components/rocm-core.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocm-core
 - name: checkoutRepo
   type: string
   default: 'self'
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -27,6 +46,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
   - job: rocm_core_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+        dependsOn:
+          - ${{ each build in parameters.buildDependsOn }}:
+            - ${{ build }}_${{ job.os }}
     pool:
       ${{ if eq(job.os, 'ubuntu2404') }}:
         vmImage: 'ubuntu-24.04'
@@ -50,8 +73,10 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
         useAmdclang: false
         extraBuildFlags: >-
@@ -65,9 +90,12 @@ jobs:
           -GNinja
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
         os: ${{ job.os }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
     # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml

From 0840c14b6dce9a337e350d4debd929a1deeb1b56 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Wed, 10 Sep 2025 11:56:06 -0400
Subject: [PATCH 29/53] [Ex CI] update rocm-core pipeline ID to monorepo

---
 .azuredevops/templates/steps/dependencies-rocm.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.azuredevops/templates/steps/dependencies-rocm.yml b/.azuredevops/templates/steps/dependencies-rocm.yml
index f62b973df..cd68fe411 100644
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -171,8 +171,8 @@ parameters:
       developBranch: develop
       hasGpuTarget: false
     rocm-core:
-      pipelineId: 103
-      developBranch: master
+      pipelineId: 349
+      developBranch: develop
       hasGpuTarget: false
     rocm-examples:
       pipelineId: 216

From 3ca9cb1fcc0914edddd153ee6a50bc11dd6ed247 Mon Sep 17 00:00:00 2001
From: anisha-amd <anisha.sankar@amd.com>
Date: Wed, 10 Sep 2025 15:02:03 -0400
Subject: [PATCH 30/53] Docs: adding ray and llama.cpp live blog links (#5290)

---
 .wordlist.txt                                          |  1 +
 .../ml-compatibility/llama-cpp-compatibility.rst       |  7 ++++++-
 .../ml-compatibility/ray-compatibility.rst             | 10 ++++++++--
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/.wordlist.txt b/.wordlist.txt
index 5370f4752..8cc6399b6 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -156,6 +156,7 @@ GEMMs
 GFLOPS
 GFortran
 GFXIP
+GGUF
 Gemma
 GiB
 GIM
diff --git a/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
index fd1356d32..1ae246931 100644
--- a/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
@@ -67,9 +67,14 @@ llama.cpp is also used in a range of real-world applications, including:
 - Various other AI applications use llama.cpp as their inference engine;  
   for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
 
-Refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_,
+For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
 where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
 
+- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__
+  blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``,
+  server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for
+  AMD Instinct GPUs within the ROCm ecosystem.
+
 .. _llama-cpp-docker-compat:
 
 Docker image compatibility
diff --git a/docs/compatibility/ml-compatibility/ray-compatibility.rst b/docs/compatibility/ml-compatibility/ray-compatibility.rst
index c5a2ed39f..2f5c83589 100644
--- a/docs/compatibility/ml-compatibility/ray-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/ray-compatibility.rst
@@ -66,9 +66,15 @@ Use cases and recommendations
   GPUs. Follow this guide to get started with verl on AMD Instinct GPUs and 
   accelerate your RLHF training with ROCm-optimized performance.
 
+* The `Exploring Use Cases for Scalable AI: Implementing Ray with ROCm Support for Efficient ML Workflows 
+  <https://rocm.blogs.amd.com/artificial-intelligence/rocm-ray/README.html>`__
+  blog post describes key use cases such as training and inference for large language models (LLMs), 
+  model serving, hyperparameter tuning, reinforcement learning, and the orchestration of large-scale 
+  workloads using Ray in the ROCm environment.
+
 For more use cases and recommendations, see the AMD GPU tabs in the `Accelerator Support 
-topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accelerator-support>`_ 
-of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`_, 
+topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accelerator-support>`__ 
+of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
 where you can search for Ray examples and best practices to optimize your workloads on AMD GPUs.
 
 .. _ray-docker-compat:

From 88f1493b687905cc252c24f9b32641c4f25a540f Mon Sep 17 00:00:00 2001
From: Haresh Sivasuntharampillai <Haresh.Sivasuntharampillai@amd.com>
Date: Wed, 10 Sep 2025 19:16:35 +0000
Subject: [PATCH 31/53] [Ex CI] enable rocminfo monorepo

---
 .azuredevops/components/rocminfo.yml | 147 ++++++++++++++++-----------
 1 file changed, 90 insertions(+), 57 deletions(-)

diff --git a/.azuredevops/components/rocminfo.yml b/.azuredevops/components/rocminfo.yml
index aada773ca..f3e87bf57 100644
--- a/.azuredevops/components/rocminfo.yml
+++ b/.azuredevops/components/rocminfo.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocminfo
 - name: checkoutRepo
   type: string
   default: 'self'
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -40,7 +59,11 @@ parameters:
 
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocminfo_build_${{ job.os }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
     pool:
       vmImage: 'ubuntu-22.04'
     ${{ if eq(job.os, 'almalinux8') }}:
@@ -62,14 +85,18 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
       parameters:
         checkoutRef: ${{ parameters.checkoutRef }}
         dependencyList: ${{ parameters.rocmDependencies }}
         aggregatePipeline: ${{ parameters.aggregatePipeline }}
         os: ${{ job.os }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
         useAmdclang: false
         extraBuildFlags: >-
@@ -78,65 +105,71 @@ jobs:
           -GNinja
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
         os: ${{ job.os }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
 
-- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocminfo_test_${{ job.target }}
-    dependsOn: rocminfo_build_${{ job.os }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-        registerROCmPackages: true
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      parameters:
-        runRocminfo: false
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocminfo
-        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: './rocm/bin/rocminfo'
-        testParameters: ''
-        testPublishResults: false
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocm_agent_enumerator
-        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: './rocm/bin/rocm_agent_enumerator'
-        testParameters: ''
-        testPublishResults: false
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        registerROCmPackages: true
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: rocminfo_test_${{ job.target }}
+      dependsOn: rocminfo_build_${{ job.os }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          packageManager: ${{ job.packageManager }}
+          registerROCmPackages: true
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+        parameters:
+          runRocminfo: false
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: '$(Agent.BuildDirectory)'
+          testExecutable: './rocm/bin/rocminfo'
+          testParameters: ''
+          testPublishResults: false
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: rocm_agent_enumerator
+          testDir: '$(Agent.BuildDirectory)'
+          testExecutable: './rocm/bin/rocm_agent_enumerator'
+          testParameters: ''
+          testPublishResults: false
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          registerROCmPackages: true
+          environment: test
+          gpuTarget: ${{ job.target }}

From 56f566c1dca6c3d934fb106d004d0b18f829fb86 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Wed, 10 Sep 2025 16:45:35 -0400
Subject: [PATCH 32/53] [Ex CI] update rocminfo pipeline ID to monorepo

---
 .azuredevops/templates/steps/dependencies-rocm.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.azuredevops/templates/steps/dependencies-rocm.yml b/.azuredevops/templates/steps/dependencies-rocm.yml
index cd68fe411..efd5a81ae 100644
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -179,8 +179,8 @@ parameters:
       developBranch: amd-staging
       hasGpuTarget: true
     rocminfo:
-      pipelineId: 91
-      developBranch: amd-staging
+      pipelineId: 356
+      developBranch: develop
       hasGpuTarget: false
     rocMLIR:
       pipelineId: 229

From d3fe7439cff7b1f2b1ab69c57cb334e68906d6af Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Wed, 10 Sep 2025 20:37:18 +0000
Subject: [PATCH 33/53] [Ex CI] enable rocm-smi-lib monorepo

---
 .azuredevops/components/rocm_smi_lib.yml | 110 ++++++++++++++---------
 1 file changed, 70 insertions(+), 40 deletions(-)

diff --git a/.azuredevops/components/rocm_smi_lib.yml b/.azuredevops/components/rocm_smi_lib.yml
index 31459a868..101ed0bd1 100644
--- a/.azuredevops/components/rocm_smi_lib.yml
+++ b/.azuredevops/components/rocm_smi_lib.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocm_smi_lib
 - name: checkoutRepo
   type: string
   default: 'self'
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -31,7 +50,11 @@ parameters:
 
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocm_smi_lib_build_${{ job.os }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
     pool:
       ${{ if eq(job.os, 'ubuntu2404') }}:
         vmImage: 'ubuntu-24.04'
@@ -55,8 +78,10 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
         useAmdclang: false
         extraBuildFlags: >-
@@ -65,51 +90,56 @@ jobs:
           -GNinja
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
         os: ${{ job.os }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
     # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
     #   parameters:
     #     aptPackages: ${{ parameters.aptPackages }}
 
-- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
-    dependsOn: rocm_smi_lib_build_${{ job.os }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      parameters:
-        runRocminfo: false
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocm_smi_lib
-        testDir: '$(Agent.BuildDirectory)'
-        testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
-        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - checkout: none
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          packageManager: ${{ job.packageManager }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+        parameters:
+          runRocminfo: false
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: '$(Agent.BuildDirectory)'
+          testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
+          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          environment: test
+          gpuTarget: ${{ job.target }}
\ No newline at end of file

From 964a7cd0b51dc12ef16048cc2277fdefaf82b1ab Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Wed, 10 Sep 2025 20:43:28 +0000
Subject: [PATCH 34/53] fixed component name

---
 .azuredevops/components/rocm_smi_lib.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.azuredevops/components/rocm_smi_lib.yml b/.azuredevops/components/rocm_smi_lib.yml
index 101ed0bd1..138bc559e 100644
--- a/.azuredevops/components/rocm_smi_lib.yml
+++ b/.azuredevops/components/rocm_smi_lib.yml
@@ -1,7 +1,7 @@
 parameters:
 - name: componentName
   type: string
-  default: rocm_smi_lib
+  default: rocm-smi-lib
 - name: checkoutRepo
   type: string
   default: 'self'
@@ -50,7 +50,7 @@ parameters:
 
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: ${{ parameters.componentName }}_build_${{ job.os }}
+  - job: rocm_smi_lib_build_${{ job.os }}
     ${{ if parameters.buildDependsOn }}:
       dependsOn:
         - ${{ each build in parameters.buildDependsOn }}:
@@ -104,8 +104,8 @@ jobs:
 
 - ${{ if eq(parameters.unifiedBuild, False) }}:
   - ${{ each job in parameters.jobMatrix.testJobs }}:
-    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}
+    - job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
+      dependsOn: rocm_smi_lib_build_${{ job.os }}
       condition:
         and(succeeded(),
           eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),

From 10f60868197a4d13591ac6949a077c05121c28b6 Mon Sep 17 00:00:00 2001
From: Joseph Macaranas <145489236+jayhawk-commits@users.noreply.github.com>
Date: Thu, 11 Sep 2025 12:53:11 -0400
Subject: [PATCH 35/53] [External CI] Updates to rocm-libraries pipelines
 (#5300)

- Add msgpack python module dependency for hipsparselt pipeline.
- Change CMake dirs for rocblas pipeline to allow relative-path access to shared/tensile directory.
---
 .azuredevops/components/hipSPARSELt.yml | 1 +
 .azuredevops/components/rocBLAS.yml     | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/.azuredevops/components/hipSPARSELt.yml b/.azuredevops/components/hipSPARSELt.yml
index 104e0ee6c..02e258f78 100644
--- a/.azuredevops/components/hipSPARSELt.yml
+++ b/.azuredevops/components/hipSPARSELt.yml
@@ -44,6 +44,7 @@ parameters:
   type: object
   default:
     - joblib
+    - msgpack
 - name: rocmDependencies
   type: object
   default:
diff --git a/.azuredevops/components/rocBLAS.yml b/.azuredevops/components/rocBLAS.yml
index 6aab7ebb3..ca3577b5b 100644
--- a/.azuredevops/components/rocBLAS.yml
+++ b/.azuredevops/components/rocBLAS.yml
@@ -179,6 +179,8 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
         os: ${{ job.os }}
+        cmakeSourceDir: $(Agent.BuildDirectory)/sparse/projects/rocblas
+        cmakeBuildDir: $(Agent.BuildDirectory)/sparse/projects/rocblas/build
         extraBuildFlags: >-
           -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
           -DCMAKE_BUILD_TYPE=Release

From 7098bdc03bf51bb79a483dc1b4fd5f0d6aa98813 Mon Sep 17 00:00:00 2001
From: Peter Park <peter.park@amd.com>
Date: Thu, 11 Sep 2025 15:01:17 -0400
Subject: [PATCH 36/53] Update vLLM inference benchmark doc for 0909 release
 (and Sphinx fixes) (#5289)

---
 .wordlist.txt                                 |   3 +
 ...vllm_0.10.0_20250812-benchmark-models.yaml |  91 ++++
 ...vllm_0.9.1_20250715-benchmark-models.yaml} |   0
 .../inference/sglang-benchmark-models.yaml    |  33 +-
 .../inference/vllm-benchmark-models.yaml      | 280 +++++++----
 .../vllm-0.10.0-20250812.rst                  | 445 ++++++++++++++++++
 .../previous-versions/vllm-0.9.1-20250715.rst |   6 +-
 .../previous-versions/vllm-history.rst        |   2 +-
 .../benchmark-docker/pytorch-inference.rst    |  42 +-
 .../inference/benchmark-docker/sglang.rst     |  26 +-
 .../inference/benchmark-docker/vllm.rst       | 363 +++++++-------
 .../training/benchmark-docker/jax-maxtext.rst |  30 +-
 .../training/benchmark-docker/megatron-lm.rst |  28 +-
 .../previous-versions/jax-maxtext-v25.5.rst   |   6 +-
 .../benchmark-docker/primus-megatron.rst      |  28 +-
 .../benchmark-docker/pytorch-training.rst     |  46 +-
 docs/sphinx/static/css/vllm-benchmark.css     |  46 +-
 17 files changed, 1041 insertions(+), 434 deletions(-)
 create mode 100644 docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
 rename docs/data/how-to/rocm-for-ai/inference/previous-versions/{vllm_0.9.1_20250715-benchmark_models.yaml => vllm_0.9.1_20250715-benchmark-models.yaml} (100%)
 create mode 100644 docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst

diff --git a/.wordlist.txt b/.wordlist.txt
index 8cc6399b6..cf9f990d4 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -673,6 +673,7 @@ github
 globals
 gnupg
 grayscale
+gx
 gzip
 heterogenous
 hipBLAS
@@ -783,6 +784,7 @@ parallelizing
 param
 parameterization
 passthrough
+pe
 perfcounter
 performant
 perl
@@ -812,6 +814,7 @@ profiler
 profilers
 protobuf
 pseudorandom
+px
 py
 pytorch
 recommender
diff --git a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
new file mode 100644
index 000000000..418415319
--- /dev/null
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
@@ -0,0 +1,91 @@
+vllm_benchmark:
+  unified_docker:
+    latest:
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
+      rocm_version: 6.4.1
+      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a
+      hipblaslt_version: 0.15
+  model_groups:
+    - group: Meta Llama
+      tag: llama
+      models:
+      - model: Llama 3.1 8B
+        mad_tag: pyt_vllm_llama-3.1-8b
+        model_repo: meta-llama/Llama-3.1-8B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B
+        precision: float16
+      - model: Llama 3.1 70B
+        mad_tag: pyt_vllm_llama-3.1-70b
+        model_repo: meta-llama/Llama-3.1-70B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+        precision: float16
+      - model: Llama 3.1 405B
+        mad_tag: pyt_vllm_llama-3.1-405b
+        model_repo: meta-llama/Llama-3.1-405B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+        precision: float16
+      - model: Llama 2 70B
+        mad_tag: pyt_vllm_llama-2-70b
+        model_repo: meta-llama/Llama-2-70b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+        precision: float16
+      - model: Llama 3.1 8B FP8
+        mad_tag: pyt_vllm_llama-3.1-8b_fp8
+        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 70B FP8
+        mad_tag: pyt_vllm_llama-3.1-70b_fp8
+        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 405B FP8
+        mad_tag: pyt_vllm_llama-3.1-405b_fp8
+        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+        precision: float8
+    - group: Mistral AI
+      tag: mistral
+      models:
+      - model: Mixtral MoE 8x7B
+        mad_tag: pyt_vllm_mixtral-8x7b
+        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+        precision: float16
+      - model: Mixtral MoE 8x22B
+        mad_tag: pyt_vllm_mixtral-8x22b
+        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+        precision: float16
+      - model: Mixtral MoE 8x7B FP8
+        mad_tag: pyt_vllm_mixtral-8x7b_fp8
+        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mixtral MoE 8x22B FP8
+        mad_tag: pyt_vllm_mixtral-8x22b_fp8
+        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        precision: float8
+    - group: Qwen
+      tag: qwen
+      models:
+      - model: QwQ-32B
+        mad_tag: pyt_vllm_qwq-32b
+        model_repo: Qwen/QwQ-32B
+        url: https://huggingface.co/Qwen/QwQ-32B
+        precision: float16
+      - model: Qwen3 30B A3B
+        mad_tag: pyt_vllm_qwen3-30b-a3b
+        model_repo: Qwen/Qwen3-30B-A3B
+        url: https://huggingface.co/Qwen/Qwen3-30B-A3B
+        precision: float16
+    - group: Microsoft Phi
+      tag: phi
+      models:
+      - model: Phi-4
+        mad_tag: pyt_vllm_phi-4
+        model_repo: microsoft/phi-4
+        url: https://huggingface.co/microsoft/phi-4
diff --git a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
similarity index 100%
rename from docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
rename to docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
diff --git a/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
index cc832dffb..8f80424d3 100644
--- a/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
@@ -1,17 +1,16 @@
-sglang_benchmark:
-  unified_docker:
-    latest:
-      pull_tag: lmsysorg/sglang:v0.4.5-rocm630
-      docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
-      rocm_version: 6.3.0
-      sglang_version: 0.4.5 (0.4.5-rocm)
-      pytorch_version: 2.6.0a0+git8d4926e
-  model_groups:
-    - group: DeepSeek
-      tag: deepseek
-      models:
-      - model: DeepSeek-R1-Distill-Qwen-32B
-        mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
-        model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
-        url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
-        precision: bfloat16
+dockers:
+  - pull_tag: lmsysorg/sglang:v0.4.5-rocm630
+    docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
+    components:
+      ROCm: 6.3.0
+      SGLang: 0.4.5 (0.4.5-rocm)
+      PyTorch: 2.6.0a0+git8d4926e
+model_groups:
+  - group: DeepSeek
+    tag: deepseek
+    models:
+    - model: DeepSeek-R1-Distill-Qwen-32B
+      mad_tag: pyt_sglang_deepseek-r1-distill-qwen-32b
+      model_repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
+      url: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
+      precision: bfloat16
diff --git a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
index a522e61a6..99d9b773b 100644
--- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
@@ -1,92 +1,188 @@
-vllm_benchmark:
-  unified_docker:
-    latest:
-      # TODO: update me
-      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
-      rocm_version: 6.4.1
-      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
-      pytorch_version: 2.7.0+gitf717b2a (2.7.0+gitf717b2a)
-      hipblaslt_version: 0.15
-  model_groups:
-    - group: Meta Llama
-      tag: llama
-      models:
-      - model: Llama 3.1 8B
-        mad_tag: pyt_vllm_llama-3.1-8b
-        model_repo: meta-llama/Llama-3.1-8B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-8B
-        precision: float16
-      - model: Llama 3.1 70B
-        mad_tag: pyt_vllm_llama-3.1-70b
-        model_repo: meta-llama/Llama-3.1-70B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-        precision: float16
-      - model: Llama 3.1 405B
-        mad_tag: pyt_vllm_llama-3.1-405b
-        model_repo: meta-llama/Llama-3.1-405B-Instruct
-        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
-        precision: float16
-      - model: Llama 2 70B
-        mad_tag: pyt_vllm_llama-2-70b
-        model_repo: meta-llama/Llama-2-70b-chat-hf
-        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
-        precision: float16
-      - model: Llama 3.1 8B FP8
-        mad_tag: pyt_vllm_llama-3.1-8b_fp8
-        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
-        precision: float8
-      - model: Llama 3.1 70B FP8
-        mad_tag: pyt_vllm_llama-3.1-70b_fp8
-        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
-        precision: float8
-      - model: Llama 3.1 405B FP8
-        mad_tag: pyt_vllm_llama-3.1-405b_fp8
-        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
-        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
-        precision: float8
-    - group: Mistral AI
-      tag: mistral
-      models:
-      - model: Mixtral MoE 8x7B
-        mad_tag: pyt_vllm_mixtral-8x7b
-        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
-        precision: float16
-      - model: Mixtral MoE 8x22B
-        mad_tag: pyt_vllm_mixtral-8x22b
-        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
-        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
-        precision: float16
-      - model: Mixtral MoE 8x7B FP8
-        mad_tag: pyt_vllm_mixtral-8x7b_fp8
-        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
-        precision: float8
-      - model: Mixtral MoE 8x22B FP8
-        mad_tag: pyt_vllm_mixtral-8x22b_fp8
-        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
-        precision: float8
-    - group: Qwen
-      tag: qwen
-      models:
-      - model: QwQ-32B
-        mad_tag: pyt_vllm_qwq-32b
-        model_repo: Qwen/QwQ-32B
-        url: https://huggingface.co/Qwen/QwQ-32B
-        precision: float16
-      - model: Qwen3 30B A3B
-        mad_tag: pyt_vllm_qwen3-30b-a3b
-        model_repo: Qwen/Qwen3-30B-A3B
-        url: https://huggingface.co/Qwen/Qwen3-30B-A3B
-        precision: float16
-    - group: Microsoft Phi
-      tag: phi
-      models:
-      - model: Phi-4
-        mad_tag: pyt_vllm_phi-4
-        model_repo: microsoft/phi-4
-        url: https://huggingface.co/microsoft/phi-4
+dockers:
+  - pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
+    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
+    components:
+      ROCm: 6.4.1
+      vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
+      PyTorch: 2.7.0+gitf717b2a
+      hipBLASLt: 0.15
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+    - model: Llama 3.1 8B
+      mad_tag: pyt_vllm_llama-3.1-8b
+      model_repo: meta-llama/Llama-3.1-8B-Instruct
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 70B
+      mad_tag: pyt_vllm_llama-3.1-70b
+      model_repo: meta-llama/Llama-3.1-70B-Instruct
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 405B
+      mad_tag: pyt_vllm_llama-3.1-405b
+      model_repo: meta-llama/Llama-3.1-405B-Instruct
+      url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 2 70B
+      mad_tag: pyt_vllm_llama-2-70b
+      model_repo: meta-llama/Llama-2-70b-chat-hf
+      url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 4096
+        max_num_batched_tokens: 4096
+        max_model_len: 4096
+    - model: Llama 3.1 8B FP8
+      mad_tag: pyt_vllm_llama-3.1-8b_fp8
+      model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+      url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+      precision: float8
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 70B FP8
+      mad_tag: pyt_vllm_llama-3.1-70b_fp8
+      model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+      url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Llama 3.1 405B FP8
+      mad_tag: pyt_vllm_llama-3.1-405b_fp8
+      model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+      url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+  - group: Mistral AI
+    tag: mistral
+    models:
+    - model: Mixtral MoE 8x7B
+      mad_tag: pyt_vllm_mixtral-8x7b
+      model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+      url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
+    - model: Mixtral MoE 8x22B
+      mad_tag: pyt_vllm_mixtral-8x22b
+      model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+      url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 65536
+        max_num_batched_tokens: 65536
+        max_model_len: 8192
+    - model: Mixtral MoE 8x7B FP8
+      mad_tag: pyt_vllm_mixtral-8x7b_fp8
+      model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+      url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
+    - model: Mixtral MoE 8x22B FP8
+      mad_tag: pyt_vllm_mixtral-8x22b_fp8
+      model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+      url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 65536
+        max_num_batched_tokens: 65536
+        max_model_len: 8192
+  - group: Qwen
+    tag: qwen
+    models:
+    - model: QwQ-32B
+      mad_tag: pyt_vllm_qwq-32b
+      model_repo: Qwen/QwQ-32B
+      url: https://huggingface.co/Qwen/QwQ-32B
+      precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Qwen3 30B A3B
+      mad_tag: pyt_vllm_qwen3-30b-a3b
+      model_repo: Qwen/Qwen3-30B-A3B
+      url: https://huggingface.co/Qwen/Qwen3-30B-A3B
+      precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
+  - group: Microsoft Phi
+    tag: phi
+    models:
+    - model: Phi-4
+      mad_tag: pyt_vllm_phi-4
+      model_repo: microsoft/phi-4
+      url: https://huggingface.co/microsoft/phi-4
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 16384
+        max_num_batched_tokens: 16384
+        max_model_len: 8192
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst
new file mode 100644
index 000000000..68d7f66e7
--- /dev/null
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst
@@ -0,0 +1,445 @@
+:orphan:
+
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                 ROCm vLLM Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   inference performance documentation. See :doc:`../vllm` for the latest version.
+
+.. _vllm-benchmark-unified-docker-812:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   a prebuilt, optimized environment for validating large language model (LLM)
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:
+
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+
+      * - `ROCm <https://github.com/ROCm/ROCm>`__
+        - {{ unified_docker.rocm_version }}
+
+      * - `vLLM <https://docs.vllm.ai/en/latest>`__
+        - {{ unified_docker.vllm_version }}
+
+      * - `PyTorch <https://github.com/ROCm/pytorch>`__
+        - {{ unified_docker.pytorch_version }}
+
+      * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
+        - {{ unified_docker.hipblaslt_version }}
+
+With this Docker image, you can quickly test the :ref:`expected
+inference performance numbers <vllm-benchmark-performance-measurements-812>` for
+MI300X series accelerators.
+
+What's new
+==========
+
+The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
+
+* Upgraded to vLLM v0.10.
+
+* FP8 KV cache support via AITER.
+
+* Full graph capture support via AITER.
+
+Supported models
+================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   .. _vllm-benchmark-available-models-812:
+
+   The following models are supported for inference performance benchmarking
+   with vLLM and ROCm. Some instructions, commands, and recommendations in this
+   documentation might vary by model -- select one to get started.
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+      <div class="row">
+         <div class="col-2 me-2 model-param-head">Model group</div>
+         <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+         </div>
+      </div>
+
+      <div class="row mt-1">
+         <div class="col-2 me-2 model-param-head">Model</div>
+         <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+         </div>
+      </div>
+      </div>
+
+   .. _vllm-benchmark-vllm-812:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. note::
+
+         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+         Some models require access authorization prior to use via an external license agreement through a third party.
+
+      {% endfor %}
+   {% endfor %}
+
+.. note::
+
+   vLLM is a toolkit and library for LLM inference and serving. AMD implements
+   high-performance custom kernels and modules in vLLM to enhance performance.
+   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+   more information.
+
+.. _vllm-benchmark-performance-measurements-812:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+page provides reference throughput and serving measurements for inferencing popular AI models.
+
+.. important::
+
+   The performance data presented in
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+   only reflects the latest version of this inference benchmarking environment.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before running inference workloads.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   Pull the Docker image
+   =====================
+
+   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull {{ unified_docker.pull_tag }}
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to reproduce the
+   benchmark results:
+
+   .. _vllm-benchmark-mad-812:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
+            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
+            and ``{{ model.mad_tag }}_serving.csv``.
+
+            Although the :ref:`available models
+            <vllm-benchmark-available-models-812>` are preconfigured to collect
+            offline throughput and online serving performance data, you can
+            also change the benchmarking parameters. See the standalone
+            benchmarking tab for more information.
+
+            {% if model.tunableop %}
+
+            .. note::
+
+               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
+               TunableOp automatically explores different implementations and configurations of certain PyTorch
+               operators to find the fastest one for your hardware.
+
+               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
+               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
+               the ``--tunableop on`` argument in your run.
+
+               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
+               performance-collection run.
+
+            {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            .. rubric:: Download the Docker image and required scripts
+
+            1. Run the vLLM benchmark tool independently by starting the
+               `Docker container <{{ unified_docker.docker_hub_url }}>`_
+               as shown in the following snippet.
+
+               .. code-block:: shell
+
+                  docker pull {{ unified_docker.pull_tag }}
+                  docker run -it \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --shm-size 16G \
+                      --security-opt seccomp=unconfined \
+                      --security-opt apparmor=unconfined \
+                      --cap-add=SYS_PTRACE \
+                      -v $(pwd):/workspace \
+                      --env HUGGINGFACE_HUB_CACHE=/workspace \
+                      --name test \
+                      {{ unified_docker.pull_tag }}
+
+            2. In the Docker container, clone the ROCm MAD repository and navigate to the
+               benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD/scripts/vllm
+
+            3. To start the benchmark, use the following command with the appropriate options.
+
+               .. code-block::
+
+                  ./run.sh \
+                      --config $CONFIG_CSV \
+                      --model_repo {{ model.model_repo }} \
+                      <overrides>
+
+               .. dropdown:: Benchmark options
+                  :open:
+
+                  .. list-table::
+                     :header-rows: 1
+                     :align: center
+
+                     * - Name
+                       - Options
+                       - Description
+
+                     * - ``--config``
+                       - ``configs/default.csv``
+                       - Run configs from the CSV for the chosen model repo and benchmark.
+
+                     * -
+                       - ``configs/extended.csv``
+                       - 
+
+                     * -
+                       - ``configs/performance.csv``
+                       - 
+
+                     * - ``--benchmark``
+                       - ``throughput``
+                       - Measure offline end-to-end throughput.
+
+                     * - 
+                       - ``serving``
+                       - Measure online serving performance.
+
+                     * - 
+                       - ``all``
+                       - Measure both throughput and serving.
+
+                     * - ``<overrides>``
+                       - See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
+                       - Additional overrides to the config CSV.
+
+                  The input sequence length, output sequence length, and tensor parallel (TP) are
+                  already configured. You don't need to specify them with this script.
+
+               .. note::
+
+                  For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
+
+                  If you encounter the following error, pass your access-authorized Hugging
+                  Face token to the gated models.
+
+                  .. code-block::
+
+                     OSError: You are trying to access a gated repo.
+
+                     # pass your HF_TOKEN
+                     export HF_TOKEN=$your_personal_hf_token
+
+            .. rubric:: Benchmarking examples
+
+            Here are some examples of running the benchmark with various options:
+
+            * Throughput benchmark
+
+              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block:: shell
+
+                 export MAD_MODEL_NAME={{ model.mad_tag }}
+                 ./run.sh \
+                     --config configs/default.csv \
+                     --model_repo {{model.model_repo}} \
+                     --benchmark throughput
+
+              Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
+
+            * Serving benchmark
+
+              Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block::
+
+                 export MAD_MODEL_NAME={{ model.mad_tag }}
+                 ./run.sh \
+                     --config configs/default.csv \
+                     --model_repo {{model.model_repo}} \
+                     --benchmark serving
+
+              Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
+
+            .. raw:: html
+
+               <style>
+               mjx-container[jax="CHTML"][display="true"] {
+                  text-align: left;
+                  margin: 0;
+               }
+               </style>
+
+            .. note::
+
+               Throughput is calculated as:
+
+               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+      {% endfor %}
+   {% endfor %}
+
+Advanced usage
+==============
+
+For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
+see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
+
+Reproducing the Docker image
+----------------------------
+
+To reproduce this ROCm/vLLM Docker image release, follow these steps:
+
+1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
+
+   .. code-block:: shell
+
+      git clone https://github.com/ROCm/vllm.git
+
+2. Check out the specific release commit.
+
+   .. code-block:: shell
+
+      cd vllm
+      git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
+
+3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
+
+   .. code-block:: shell
+
+      docker build -f docker/Dockerfile.rocm -t vllm-rocm .
+
+Further reading
+===============
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
index 34df0359d..9f6d001ad 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
@@ -16,7 +16,7 @@ vLLM inference performance testing
 
 .. _vllm-benchmark-unified-docker-715:
 
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
 
    {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
    {% set model_groups = data.vllm_benchmark.model_groups %}
@@ -69,7 +69,7 @@ The following is summary of notable changes since the :doc:`previous ROCm/vLLM D
 Supported models
 ================
 
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
 
    {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
    {% set model_groups = data.vllm_benchmark.model_groups %}
@@ -162,7 +162,7 @@ To test for optimal performance, consult the recommended :ref:`System health ben
 <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
 system's configuration.
 
-.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
 
    {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
    {% set model_groups = data.vllm_benchmark.model_groups %}
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
index 6f87670ec..857a1ee0b 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
@@ -7,7 +7,7 @@ vLLM inference performance testing version history
 This table lists previous versions of the ROCm vLLM inference Docker image for
 inference performance testing. For detailed information about available models
 for benchmarking, see the version-specific documentation. You can find tagged
-previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`__.
+previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`__.
 
 .. list-table::
    :header-rows: 1
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
index b9e22bf33..ad8db53c4 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
@@ -31,26 +31,30 @@ PyTorch inference performance testing
    .. raw:: html
 
       <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-        <div class="row">
-          <div class="col-2 me-2 model-param-head">Model</div>
-          <div class="row col-10">
-   {% for model_group in model_groups %}
-            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-   {% endfor %}
-          </div>
-        </div>
-
-        <div class="row mt-1" style="display: none;">
-          <div class="col-2 me-2 model-param-head">Model</div>
-          <div class="row col-10">
-   {% for model_group in model_groups %}
-      {% set models = model_group.models %}
-      {% for model in models %}
-            <div class="col-12 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
       {% endfor %}
-   {% endfor %}
-          </div>
-        </div>
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1" style="display: none;">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
       </div>
 
    {% for model_group in model_groups %}
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
index 340ef975e..1722b2018 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
@@ -2,19 +2,19 @@
    :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and SGLang
    :keywords: model, MAD, automation, dashboarding, validate
 
-************************************
-SGLang inference performance testing
-************************************
+**********************************************************************
+SGLang inference performance testing for DeepSeek-R1-Distill-Qwen-32B
+**********************************************************************
 
 .. _sglang-benchmark-unified-docker:
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
 
-   {% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
+   {% set docker = data.dockers[0] %}
 
    `SGLang <https://docs.sglang.ai>`__ is a high-performance inference and
    serving engine for large language models (LLMs) and vision models. The
-   ROCm-enabled `SGLang Docker image <{{ unified_docker.docker_hub_url }}>`__
+   ROCm-enabled `SGLang Docker image <{{ docker.docker_hub_url }}>`__
    bundles SGLang with PyTorch, optimized for AMD Instinct MI300X series
    accelerators. It includes the following software components:
 
@@ -24,14 +24,10 @@ SGLang inference performance testing
       * - Software component
         - Version
 
-      * - `ROCm <https://github.com/ROCm/ROCm>`__
-        - {{ unified_docker.rocm_version }}
-
-      * - `SGLang <https://docs.sglang.ai/index.html>`__
-        - {{ unified_docker.sglang_version }} 
-
-      * - `PyTorch <https://github.com/pytorch/pytorch>`__
-        - {{ unified_docker.pytorch_version }} 
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}
 
 System validation
 =================
@@ -50,8 +46,8 @@ system's configuration.
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
 
-   {% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
-   {% set model_groups = data.sglang_benchmark.model_groups %}
+   {% set unified_docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}
 
    Pull the Docker image
    =====================
diff --git a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
index 9f3bd608d..f2b060ebd 100644
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
@@ -7,14 +7,13 @@
 vLLM inference performance testing
 **********************************
 
-.. _vllm-benchmark-unified-docker-812:
+.. _vllm-benchmark-unified-docker-909:
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
 
-   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
-   {% set model_groups = data.vllm_benchmark.model_groups %}
+   {% set docker = data.dockers[0] %}
 
-   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers
    a prebuilt, optimized environment for validating large language model (LLM)
    inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
    Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
@@ -26,20 +25,13 @@ vLLM inference performance testing
       * - Software component
         - Version
 
-      * - `ROCm <https://github.com/ROCm/ROCm>`__
-        - {{ unified_docker.rocm_version }}
-
-      * - `vLLM <https://docs.vllm.ai/en/latest>`__
-        - {{ unified_docker.vllm_version }}
-
-      * - `PyTorch <https://github.com/ROCm/pytorch>`__
-        - {{ unified_docker.pytorch_version }}
-
-      * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
-        - {{ unified_docker.hipblaslt_version }}
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}
 
 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements-812>` for
+inference performance numbers <vllm-benchmark-performance-measurements-909>` for
 MI300X series accelerators.
 
 What's new
@@ -47,21 +39,23 @@ What's new
 
 The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.
 
-* Upgraded to vLLM v0.10.
+* Upgraded to vLLM v0.10.1.
 
-* FP8 KV cache support via AITER.
+* Set ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` by default for better performance.
 
-* Full graph capture support via AITER.
+* Set ``VLLM_ROCM_USE_AITER_RMSNORM=0`` by default to avoid various issues with ``torch.compile``.
+
+.. _vllm-benchmark-supported-models-909:
 
 Supported models
 ================
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
 
-   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
-   {% set model_groups = data.vllm_benchmark.model_groups %}
+   {% set docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}
 
-   .. _vllm-benchmark-available-models-812:
+   .. _vllm-benchmark-available-models-909:
 
    The following models are supported for inference performance benchmarking
    with vLLM and ROCm. Some instructions, commands, and recommendations in this
@@ -70,55 +64,51 @@ Supported models
    .. raw:: html
 
       <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-      <div class="row">
-         <div class="col-2 me-2 model-param-head">Model group</div>
-         <div class="row col-10">
-   {% for model_group in model_groups %}
-            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-   {% endfor %}
-         </div>
-      </div>
-
-      <div class="row mt-1">
-         <div class="col-2 me-2 model-param-head">Model</div>
-         <div class="row col-10">
-   {% for model_group in model_groups %}
-      {% set models = model_group.models %}
-      {% for model in models %}
-         {% if models|length % 3 == 0 %}
-            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-         {% else %}
-            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-         {% endif %}
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
       {% endfor %}
-   {% endfor %}
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
          </div>
       </div>
-      </div>
 
-   .. _vllm-benchmark-vllm-812:
+   .. _vllm-benchmark-vllm-909:
 
    {% for model_group in model_groups %}
       {% for model in model_group.models %}
 
-   .. container:: model-doc {{model.mad_tag}}
+   .. container:: model-doc {{ model.mad_tag }}
 
       .. note::
 
          See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
          Some models require access authorization prior to use via an external license agreement through a third party.
+      {% if model.precision == "float8" and model.model_repo.startswith("amd") %}
+         This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD accelerators.
+      {% endif %}
 
       {% endfor %}
    {% endfor %}
 
-.. note::
-
-   vLLM is a toolkit and library for LLM inference and serving. AMD implements
-   high-performance custom kernels and modules in vLLM to enhance performance.
-   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
-   more information.
-
-.. _vllm-benchmark-performance-measurements-812:
+.. _vllm-benchmark-performance-measurements-909:
 
 Performance measurements
 ========================
@@ -151,18 +141,18 @@ system's configuration.
 
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
 
-   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
-   {% set model_groups = data.vllm_benchmark.model_groups %}
+   {% set docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}
 
    Pull the Docker image
    =====================
 
-   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
    Use the following command to pull the Docker image from Docker Hub.
 
    .. code-block:: shell
 
-      docker pull {{ unified_docker.pull_tag }}
+      docker pull {{ docker.pull_tag }}
 
    Benchmarking
    ============
@@ -170,7 +160,7 @@ system's configuration.
    Once the setup is complete, choose between two options to reproduce the
    benchmark results:
 
-   .. _vllm-benchmark-mad-812:
+   .. _vllm-benchmark-mad-909:
 
    {% for model_group in model_groups %}
       {% for model in model_group.models %}
@@ -181,6 +171,9 @@ system's configuration.
 
          .. tab-item:: MAD-integrated benchmarking
 
+            The following run command is tailored to {{ model.model }}.
+            See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
+
             1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
                directory and install the required packages on the host machine.
 
@@ -208,7 +201,7 @@ system's configuration.
             and ``{{ model.mad_tag }}_serving.csv``.
 
             Although the :ref:`available models
-            <vllm-benchmark-available-models-812>` are preconfigured to collect
+            <vllm-benchmark-available-models-909>` are preconfigured to collect
             offline throughput and online serving performance data, you can
             also change the benchmarking parameters. See the standalone
             benchmarking tab for more information.
@@ -232,132 +225,143 @@ system's configuration.
 
          .. tab-item:: Standalone benchmarking
 
-            .. rubric:: Download the Docker image and required scripts
+            The following commands are optimized for {{ model.model }}.
+            See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
 
-            1. Run the vLLM benchmark tool independently by starting the
-               `Docker container <{{ unified_docker.docker_hub_url }}>`_
-               as shown in the following snippet.
+            .. seealso::
+
+               For more information on configuration, see the `config files
+               <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
+               in the MAD repository. Refer to the `vLLM engine arguments <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
+               for descriptions of available configuration options
+               and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
+               additional benchmarking information.
+
+            .. rubric:: Launch the container
+
+            You can run the vLLM benchmark tool independently by starting the
+            `Docker container <{{ docker.docker_hub_url }}>`_ as shown
+            in the following snippet.
+
+            .. code-block:: shell
+
+               docker pull {{ docker.pull_tag }}
+               docker run -it \
+                   --device=/dev/kfd \
+                   --device=/dev/dri \
+                   --group-add video \
+                   --shm-size 16G \
+                   --security-opt seccomp=unconfined \
+                   --security-opt apparmor=unconfined \
+                   --cap-add=SYS_PTRACE \
+                   -v $(pwd):/workspace \
+                   --env HUGGINGFACE_HUB_CACHE=/workspace \
+                   --name test \
+                   {{ docker.pull_tag }}
+
+            .. rubric:: Throughput command
+
+            Use the following command to start the throughput benchmark.
+
+            .. code-block:: shell
+
+               model={{ model.model_repo }}
+               tp={{ model.config.tp }}
+               num_prompts=1024
+               in=128
+               out=128
+               dtype={{ model.config.dtype }}
+               kv_cache_dtype={{ model.config.kv_cache_dtype }}
+               max_num_seqs=1024
+               max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
+               max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
+               max_model_len={{ model.config.max_model_len }}
+
+               vllm bench throughput --model $model \
+                   -tp $tp \
+                   --num-prompts $num_prompts \
+                   --input-len $in \
+                   --output-len $out \
+                   --dtype $dtype \
+                   --kv-cache-dtype $kv_cache_dtype \
+                   --max-num-seqs $max_num_seqs \
+                   --max-seq-len-to-capture $max_seq_len_to_capture \
+                   --max-num-batched-tokens $max_num_batched_tokens \
+                   --max-model-len $max_model_len \
+                   --trust-remote-code \
+                   --output-json ${model##*/}_throughput.json \
+                   --gpu-memory-utilization 0.9
+
+            .. rubric:: Serving command
+
+            1. Start the server using the following command:
 
                .. code-block:: shell
 
-                  docker pull {{ unified_docker.pull_tag }}
-                  docker run -it \
-                      --device=/dev/kfd \
-                      --device=/dev/dri \
-                      --group-add video \
-                      --shm-size 16G \
-                      --security-opt seccomp=unconfined \
-                      --security-opt apparmor=unconfined \
-                      --cap-add=SYS_PTRACE \
-                      -v $(pwd):/workspace \
-                      --env HUGGINGFACE_HUB_CACHE=/workspace \
-                      --name test \
-                      {{ unified_docker.pull_tag }}
+                  model={{ model.model_repo }}
+                  tp={{ model.config.tp }}
+                  dtype={{ model.config.dtype }}
+                  kv_cache_dtype={{ model.config.kv_cache_dtype }}
+                  max_num_seqs=256
+                  max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
+                  max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
+                  max_model_len={{ model.config.max_model_len }}
 
-            2. In the Docker container, clone the ROCm MAD repository and navigate to the
-               benchmark scripts directory at ``~/MAD/scripts/vllm``.
+                  vllm serve $model \
+                      -tp $tp \
+                      --dtype $dtype \
+                      --kv-cache-dtype $kv_cache_dtype \
+                      --max-num-seqs $max_num_seqs \
+                      --max-seq-len-to-capture $max_seq_len_to_capture \
+                      --max-num-batched-tokens $max_num_batched_tokens \
+                      --max-model-len $max_model_len \
+                      --no-enable-prefix-caching \
+                      --swap-space 16 \
+                      --disable-log-requests \
+                      --trust-remote-code \
+                      --gpu-memory-utilization 0.9
+
+               Wait until the model has loaded and the server is ready to accept requests.
+
+            2. In another terminal on the same machine, run the benchmark:
 
                .. code-block:: shell
 
-                  git clone https://github.com/ROCm/MAD
-                  cd MAD/scripts/vllm
+                  # Connect to the container
+                  docker exec -it test bash
 
-            3. To start the benchmark, use the following command with the appropriate options.
+                  # Wait for the server to start
+                  until curl -s http://localhost:8000/v1/models; do sleep 30; done
+
+                  # Run the benchmark
+                  model={{ model.model_repo }}
+                  max_concurrency=1
+                  num_prompts=10
+                  in=128
+                  out=128
+                  vllm bench serve --model $model \
+                      --percentile-metrics "ttft,tpot,itl,e2el" \
+                      --dataset-name random \
+                      --ignore-eos \
+                      --max-concurrency $max_concurrency \
+                      --num-prompts $num_prompts \
+                      --random-input-len $in \
+                      --random-output-len $out \
+                      --trust-remote-code \
+                      --save-result \
+                      --result-filename ${model##*/}_serving.json
+
+            .. note::
+
+               If you encounter the following error, export a Hugging Face token
+               that is authorized to access the gated model.
 
                .. code-block::
 
-                  ./run.sh \
-                      --config $CONFIG_CSV \
-                      --model_repo {{ model.model_repo }} \
-                      <overrides>
+                  OSError: You are trying to access a gated repo.
 
-               .. dropdown:: Benchmark options
-                  :open:
-
-                  .. list-table::
-                     :header-rows: 1
-                     :align: center
-
-                     * - Name
-                       - Options
-                       - Description
-
-                     * - ``--config``
-                       - ``configs/default.csv``
-                       - Run configs from the CSV for the chosen model repo and benchmark.
-
-                     * -
-                       - ``configs/extended.csv``
-                       - 
-
-                     * -
-                       - ``configs/performance.csv``
-                       - 
-
-                     * - ``--benchmark``
-                       - ``throughput``
-                       - Measure offline end-to-end throughput.
-
-                     * - 
-                       - ``serving``
-                       - Measure online serving performance.
-
-                     * - 
-                       - ``all``
-                       - Measure both throughput and serving.
-
-                     * - `<overrides>`
-                       - See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
-                       - Additional overrides to the config CSV.
-
-                  The input sequence length, output sequence length, and tensor parallel (TP) are
-                  already configured. You don't need to specify them with this script.
-
-               .. note::
-
-                  For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
-
-                  If you encounter the following error, pass your access-authorized Hugging
-                  Face token to the gated models.
-
-                  .. code-block::
-
-                     OSError: You are trying to access a gated repo.
-
-                     # pass your HF_TOKEN
-                     export HF_TOKEN=$your_personal_hf_token
-
-            .. rubric:: Benchmarking examples
-
-            Here are some examples of running the benchmark with various options:
-
-            * Throughput benchmark
-
-              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
-
-              .. code-block:: shell
-
-                 export MAD_MODEL_NAME={{ model.mad_tag }}
-                 ./run.sh \
-                     --config configs/default.csv \
-                     --model_repo {{model.model_repo}} \
-                     --benchmark throughput
-
-              Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
-
-            * Serving benchmark
-
-              Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
-
-              .. code-block::
-
-                 export MAD_MODEL_NAME={{ model.mad_tag }}
-                 ./run.sh \
-                     --config configs/default.csv \
-                     --model_repo {{model.model_repo}} \
-                     --benchmark serving
-
-              Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
+                  # pass your HF_TOKEN
+                  export HF_TOKEN=$your_personal_hf_token
 
             .. raw:: html
 
@@ -382,7 +386,7 @@ Advanced usage
 ==============
 
 For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
-see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
+see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.
 
 Reproducing the Docker image
 ----------------------------
@@ -400,7 +404,7 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
    .. code-block:: shell
 
       cd vllm
-      git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
+      git checkout 6663000a391911eba96d7864a26ac42b07f6ef29
 
 3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
 
@@ -419,15 +423,12 @@ Further reading
 - To learn more about system settings and management practices to configure your system for
   AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
 
+- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+  a brief introduction to vLLM and optimization strategies.
+
 - For application performance optimization strategies for HPC and AI workloads,
   including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
 
-- To learn how to run community models from Hugging Face on AMD GPUs, see
-  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
-
-- To learn how to fine-tune LLMs and optimize inference, see
-  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
-
 - For a list of other ready-made Docker images for AI with ROCm, see
   `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
 
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
index a85f5af56..76c3582e7 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -82,32 +82,32 @@ started.
    {% set model_groups = data.model_groups %}
    .. raw:: html
 
-         <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-           <div class="row">
-             <div class="col-2 me-2 model-param-head">Model</div>
-             <div class="row col-10">
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
       {% for model_group in model_groups %}
-               <div class="col-4 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+               <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
       {% endfor %}
-             </div>
-           </div>
+            </div>
+         </div>
 
-           <div class="row mt-1">
-             <div class="col-2 me-2 model-param-head">Model variant</div>
-             <div class="row col-10">
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
       {% for model_group in model_groups %}
          {% set models = model_group.models %}
          {% for model in models %}
             {% if models|length % 3 == 0 %}
-               <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
             {% else %}
-               <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
             {% endif %}
          {% endfor %}
       {% endfor %}
-             </div>
-           </div>
+            </div>
          </div>
+      </div>
 
 .. note::
 
@@ -208,7 +208,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`.
 
    e. RDMA interface
 
-      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
+      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v257>` are installed on all nodes.
       Then, set the RDMA interfaces to use for communication.
 
       .. code-block:: bash
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
index 687cc514f..4df1da960 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
@@ -70,32 +70,32 @@ workloads:
    {% set model_groups = data.model_groups %}
    .. raw:: html
 
-         <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-           <div class="row">
-             <div class="col-2 me-2 model-param-head">Model</div>
-             <div class="row col-10">
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
       {% for model_group in model_groups %}
-               <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
       {% endfor %}
-             </div>
-           </div>
+            </div>
+         </div>
 
-           <div class="row mt-1">
-             <div class="col-2 me-2 model-param-head">Model variant</div>
-             <div class="row col-10">
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
       {% for model_group in model_groups %}
          {% set models = model_group.models %}
          {% for model in models %}
             {% if models|length % 3 == 0 %}
-               <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
             {% else %}
-               <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
             {% endif %}
          {% endfor %}
       {% endfor %}
-             </div>
-           </div>
+            </div>
          </div>
+      </div>
 
 .. note::
 
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst
index d5051d28c..9bd7081d2 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst
@@ -112,7 +112,7 @@ Multi-node setup
 
 For multi-node environments, ensure you have all the necessary packages for
 your network device, such as, RDMA. If you're not using a multi-node setup
-with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
+with RDMA, skip ahead to :ref:`amd-maxtext-download-docker-v255`.
 
 1. Install the following packages to build and install the RDMA driver.
 
@@ -177,7 +177,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
 
    e. RDMA interface
 
-      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
+      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v255>` are installed on all nodes.
       Then, set the RDMA interfaces to use for communication.
 
       .. code-block:: bash
@@ -199,7 +199,7 @@ Pull the Docker image
       docker pull rocm/jax-training:maxtext-v25.5
 
 2. Use the following command to launch the Docker container. Note that the benchmarking scripts
-   used in the :ref:`following section <amd-maxtext-get-started>` automatically launch the Docker container
+   used in the :ref:`following section <amd-maxtext-get-started-v255>` automatically launch the Docker container
    and execute the benchmark.
 
    .. code-block:: shell
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
index 0a80c7c9b..81ec4ed50 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
@@ -55,32 +55,32 @@ vary by model -- select one to get started.
    {% set model_groups = data.model_groups %}
    .. raw:: html
 
-         <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-           <div class="row">
-             <div class="col-2 me-2 model-param-head">Model</div>
-             <div class="row col-10">
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
       {% for model_group in model_groups %}
-               <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
       {% endfor %}
-             </div>
-           </div>
+            </div>
+         </div>
 
-           <div class="row mt-1">
-             <div class="col-2 me-2 model-param-head">Model variant</div>
-             <div class="row col-10">
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
       {% for model_group in model_groups %}
          {% set models = model_group.models %}
          {% for model in models %}
             {% if models|length % 3 == 0 %}
-               <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
             {% else %}
-               <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
             {% endif %}
          {% endfor %}
       {% endfor %}
-             </div>
-           </div>
+            </div>
          </div>
+      </div>
 
 .. note::
 
diff --git a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
index e7258e07b..d8ab01318 100644
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
@@ -45,30 +45,30 @@ vary by model -- select one to get started.
    .. raw:: html
 
       <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-        <div class="row">
-          <div class="col-2 me-2 model-param-head">Model group</div>
-          <div class="row col-10">
-   {% for model_group in model_groups %}
-            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
-   {% endfor %}
-          </div>
-        </div>
-
-        <div class="row mt-1">
-          <div class="col-2 me-2 model-param-head">Model variant</div>
-          <div class="row col-10">
-   {% for model_group in model_groups %}
-      {% set models = model_group.models %}
-      {% for model in models %}
-         {% if models|length % 3 == 0 %}
-            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-         {% else %}
-            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
-         {% endif %}
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
       {% endfor %}
-   {% endfor %}
-          </div>
-        </div>
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
       </div>
 
 
diff --git a/docs/sphinx/static/css/vllm-benchmark.css b/docs/sphinx/static/css/vllm-benchmark.css
index 4c10b1ffb..231bb2cac 100644
--- a/docs/sphinx/static/css/vllm-benchmark.css
+++ b/docs/sphinx/static/css/vllm-benchmark.css
@@ -7,15 +7,14 @@ html {
   --compat-head-color: var(--pst-color-surface);
   --compat-param-hover-color: var(--pst-color-link-hover);
   --compat-param-selected-color: var(--pst-color-primary);
+  --compat-border-color: var(--pst-color-border);
 }
 
 html[data-theme="light"] {
-  --compat-border-color: var(--pst-gray-500);
   --compat-param-disabled-color: var(--pst-gray-300);
 }
 
 html[data-theme="dark"] {
-  --compat-border-color: var(--pst-gray-600);
   --compat-param-disabled-color: var(--pst-gray-600);
 }
 
@@ -23,6 +22,7 @@ div#vllm-benchmark-ud-params-picker.container-fluid {
   padding: 0 0 1rem 0;
 }
 
+div[data-param-k="model-group"],
 div[data-param-k="model"] {
   background-color: var(--compat-bg-color);
   padding: 2px;
@@ -31,40 +31,19 @@ div[data-param-k="model"] {
   cursor: pointer;
 }
 
+div[data-param-k="model-group"][data-param-state="selected"],
 div[data-param-k="model"][data-param-state="selected"] {
   background-color: var(--compat-param-selected-color);
   color: var(--compat-fg-color);
 }
 
-div[data-param-k="model"][data-param-state="latest-version"] {
-  background-color: var(--compat-param-selected-color);
-  color: var(--compat-fg-color);
-}
-
-div[data-param-k="model"][data-param-state="disabled"] {
-  background-color: var(--compat-param-disabled-color);
-  text-decoration: line-through;
-  /* text-decoration-color: var(--pst-color-danger); */
-  cursor: auto;
-}
-
-div[data-param-k="model"]:not([data-param-state]):hover {
+div[data-param-k="model-group"]:hover,
+div[data-param-k="model"]:hover {
   background-color: var(--compat-param-hover-color);
-}
-
-div[data-param-k="model-group"] {
-  background-color: var(--compat-bg-color);
-  padding: 2px;
-  border: solid 1px var(--compat-border-color);
-  font-weight: 500;
-  cursor: pointer;
-}
-
-div[data-param-k="model-group"][data-param-state="selected"] {
-  background-color: var(--compat-param-selected-color);
   color: var(--compat-fg-color);
 }
 
+/*
 div[data-param-k="model-group"][data-param-state="latest-version"] {
   background-color: var(--compat-param-selected-color);
   color: var(--compat-fg-color);
@@ -73,26 +52,19 @@ div[data-param-k="model-group"][data-param-state="latest-version"] {
 div[data-param-k="model-group"][data-param-state="disabled"] {
   background-color: var(--compat-param-disabled-color);
   text-decoration: line-through;
-  /* text-decoration-color: var(--pst-color-danger); */
+  text-decoration-color: var(--pst-color-danger);
   cursor: auto;
 }
-
-div[data-param-k="model-group"]:not([data-param-state]):hover {
-  background-color: var(--compat-param-hover-color);
-}
+*/
 
 .model-param-head {
   background-color: var(--compat-head-color);
   padding: 0.15rem 0.15rem 0.15rem 0.67rem;
-  /* margin: 2px; */
-  border-right: solid 2px var(--compat-accent-color);
+  border-right: solid 4px var(--compat-accent-color);
   font-weight: 600;
 }
 
 .model-param {
-  /* padding: 2px; */
-  /* margin: 0 2px 0 2px; */
-  /* margin: 2px; */
   border: solid 1px var(--compat-border-color);
   font-weight: 500;
 }

From 0d790615efd8311f6a10fcd2f40fcc15dabb5cc9 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Wed, 3 Sep 2025 12:03:02 -0400
Subject: [PATCH 37/53] [Ex CI] Update pipeline Id for rocprofiler-compute to
 monorepo

---
 .azuredevops/templates/steps/dependencies-rocm.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/templates/steps/dependencies-rocm.yml b/.azuredevops/templates/steps/dependencies-rocm.yml
index efd5a81ae..4090d07be 100644
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -207,7 +207,7 @@ parameters:
       developBranch: develop
       hasGpuTarget: true
     rocprofiler-compute:
-      pipelineId: 257
+      pipelineId: 334
       developBranch: develop
       hasGpuTarget: true
     rocprofiler-register:

From 61f09e2ab935c91b6bf02c51552c6e0b045064ac Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Tue, 9 Sep 2025 15:27:00 -0400
Subject: [PATCH 38/53] Update pipelineId for rocprofiler-compute

---
 .azuredevops/templates/steps/dependencies-rocm.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/templates/steps/dependencies-rocm.yml b/.azuredevops/templates/steps/dependencies-rocm.yml
index 4090d07be..e6e6db966 100644
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -207,7 +207,7 @@ parameters:
       developBranch: develop
       hasGpuTarget: true
     rocprofiler-compute:
-      pipelineId: 334
+      pipelineId: 344
       developBranch: develop
       hasGpuTarget: true
     rocprofiler-register:

From a6fbf60594ed67a1a54d1a4733928210a33b1a9f Mon Sep 17 00:00:00 2001
From: Haresh Sivasuntharampillai <Haresh.Sivasuntharampillai@amd.com>
Date: Wed, 10 Sep 2025 17:54:37 +0000
Subject: [PATCH 39/53] [Ex CI] enable rocr-runtime monorepo

---
 .azuredevops/components/ROCR-Runtime.yml | 219 +++++++++++++----------
 1 file changed, 126 insertions(+), 93 deletions(-)

diff --git a/.azuredevops/components/ROCR-Runtime.yml b/.azuredevops/components/ROCR-Runtime.yml
index 0358dd335..7d5d07eba 100644
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocr-runtime
 - name: checkoutRepo
   type: string
   default: 'self'
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -45,6 +64,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
   - job: ROCR_Runtime_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
     pool:
       vmImage: 'ubuntu-22.04'
     ${{ if eq(job.os, 'almalinux8') }}:
@@ -65,14 +88,18 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
       parameters:
         checkoutRef: ${{ parameters.checkoutRef }}
         dependencyList: ${{ parameters.rocmDependencies }}
         aggregatePipeline: ${{ parameters.aggregatePipeline }}
         os: ${{ job.os }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
         useAmdclang: false
         extraBuildFlags: >-
@@ -82,105 +109,111 @@ jobs:
           -GNinja
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
         os: ${{ job.os }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         os: ${{ job.os }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
     # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
     #   parameters:
     #     aptPackages: ${{ parameters.aptPackages }}
 
-- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: ROCR_Runtime_test_${{ job.os }}_${{ job.target }}
-    dependsOn: ROCR_Runtime_build_${{ job.os }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: ${{ parameters.checkoutRepo }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-      parameters:
-        runRocminfo: false
-    - task: Bash@3
-      displayName: Build kfdtest
-      inputs:
-        targetType: 'inline'
-        workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest
-        script: |
-          if [ -e /opt/rh/gcc-toolset-14/enable ]; then
-            source /opt/rh/gcc-toolset-14/enable
-          fi
-          mkdir build && cd build
-          cmake -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm ..
-          make
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: kfdtest
-        testExecutable: BIN_DIR=$(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/build ./run_kfdtest.sh
-        testParameters: '-p core --gtest_output=xml:./test_output.xml --gtest_color=yes'
-        testDir: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/scripts
-        os: ${{ job.os }}
-    - task: Bash@3
-      displayName: Build rocrtst
-      inputs:
-        targetType: 'inline'
-        workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common
-        script: |
-          echo $(Build.SourcesDirectory)/rocrtst/thirdparty/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
-          sudo cat /etc/ld.so.conf.d/rocm-ci.conf
-          sudo ldconfig -v
-          ldconfig -p
-          if [ -e /opt/rh/gcc-toolset-14/enable ]; then
-            source /opt/rh/gcc-toolset-14/enable
-          fi
-          BASE_CLANG_DIR=$(Agent.BuildDirectory)/rocm/llvm/lib/clang
-          export NEWEST_CLANG_VER=$(ls -1 $BASE_CLANG_DIR | sort -V | tail -n 1)
-          mkdir build && cd build
-          cmake .. \
-            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm \
-            -DTARGET_DEVICES=${{ job.target }} \
-            -DROCM_DIR=$(Agent.BuildDirectory)/rocm \
-            -DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm/bin \
-            -DOPENCL_INC_DIR=$BASE_CLANG_DIR/$NEWEST_CLANG_VER/include
-          make
-          make rocrtst_kernels
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocrtst
-        testExecutable: ./rocrtst64
-        testParameters: '--gtest_filter="-rocrtstNeg.Memory_Negative_Tests:rocrtstFunc.Memory_Max_Mem" --gtest_output=xml:./test_output.xml --gtest_color=yes'
-        testDir: $(Build.SourcesDirectory)/rocrtst/suites/test_common/build/${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        environment: test
-        gpuTarget: ${{ job.target }}
-  # docker image will be missing libhwloc5
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: ROCR_Runtime_test_${{ job.os }}_${{ job.target }}
+      dependsOn: ROCR_Runtime_build_${{ job.os }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          packageManager: ${{ job.packageManager }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+        parameters:
+          runRocminfo: false
+      - task: Bash@3
+        displayName: Build kfdtest
+        inputs:
+          targetType: 'inline'
+          workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest
+          script: |
+            if [ -e /opt/rh/gcc-toolset-14/enable ]; then
+              source /opt/rh/gcc-toolset-14/enable
+            fi
+            mkdir build && cd build
+            cmake -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm ..
+            make
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: kfdtest
+          testExecutable: BIN_DIR=$(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/build ./run_kfdtest.sh
+          testParameters: '-p core --gtest_output=xml:./test_output.xml --gtest_color=yes'
+          testDir: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/scripts
+          os: ${{ job.os }}
+      - task: Bash@3
+        displayName: Build rocrtst
+        inputs:
+          targetType: 'inline'
+          workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common
+          script: |
+            echo $(Build.SourcesDirectory)/rocrtst/thirdparty/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
+            sudo cat /etc/ld.so.conf.d/rocm-ci.conf
+            sudo ldconfig -v
+            ldconfig -p
+            if [ -e /opt/rh/gcc-toolset-14/enable ]; then
+              source /opt/rh/gcc-toolset-14/enable
+            fi
+            BASE_CLANG_DIR=$(Agent.BuildDirectory)/rocm/llvm/lib/clang
+            export NEWEST_CLANG_VER=$(ls -1 $BASE_CLANG_DIR | sort -V | tail -n 1)
+            mkdir build && cd build
+            cmake .. \
+              -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm \
+              -DTARGET_DEVICES=${{ job.target }} \
+              -DROCM_DIR=$(Agent.BuildDirectory)/rocm \
+              -DLLVM_DIR=$(Agent.BuildDirectory)/rocm/llvm/bin \
+              -DOPENCL_INC_DIR=$BASE_CLANG_DIR/$NEWEST_CLANG_VER/include
+            make
+            make rocrtst_kernels
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: rocrtst
+          testExecutable: ./rocrtst64
+          testParameters: '--gtest_filter="-rocrtstNeg.Memory_Negative_Tests:rocrtstFunc.Memory_Max_Mem" --gtest_output=xml:./test_output.xml --gtest_color=yes'
+          testDir: $(Build.SourcesDirectory)/rocrtst/suites/test_common/build/${{ job.target }}
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+    # docker image will be missing libhwloc5

From 91f21d890fdf7a5b2bb02541aad0b8d111b1d539 Mon Sep 17 00:00:00 2001
From: Haresh Sivasuntharampillai <Haresh.Sivasuntharampillai@amd.com>
Date: Wed, 10 Sep 2025 18:44:18 +0000
Subject: [PATCH 40/53] Fixed SparseCheckout

---
 .azuredevops/components/ROCR-Runtime.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.azuredevops/components/ROCR-Runtime.yml b/.azuredevops/components/ROCR-Runtime.yml
index 7d5d07eba..cdf935e2a 100644
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -158,6 +158,7 @@ jobs:
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
         parameters:
           checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
         parameters:
           runRocminfo: false

From 26ddf7e6ac1914241426b6da61683263c8306105 Mon Sep 17 00:00:00 2001
From: Haresh Sivasuntharampillai <Haresh.Sivasuntharampillai@amd.com>
Date: Wed, 10 Sep 2025 19:48:53 +0000
Subject: [PATCH 41/53] test commit

---
 .azuredevops/components/ROCR-Runtime.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/components/ROCR-Runtime.yml b/.azuredevops/components/ROCR-Runtime.yml
index cdf935e2a..3482c0a18 100644
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -8,7 +8,7 @@ parameters:
 - name: checkoutRef
   type: string
   default: ''
-# monorepo related parameters
+# monorepo related parameters test
 - name: sparseCheckoutDir
   type: string
   default: ''

From 8617b653f8a2ca00ed7f4556dc5321161f36a90e Mon Sep 17 00:00:00 2001
From: Haresh Sivasuntharampillai <Haresh.Sivasuntharampillai@amd.com>
Date: Wed, 10 Sep 2025 19:53:30 +0000
Subject: [PATCH 42/53] test commit

---
 .azuredevops/components/ROCR-Runtime.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/components/ROCR-Runtime.yml b/.azuredevops/components/ROCR-Runtime.yml
index 3482c0a18..cdf935e2a 100644
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -8,7 +8,7 @@ parameters:
 - name: checkoutRef
   type: string
   default: ''
-# monorepo related parameters test
+# monorepo related parameters
 - name: sparseCheckoutDir
   type: string
   default: ''

From 9b2b1d3a661de72fa5d29a6606c79441e17fa87f Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Wed, 10 Sep 2025 19:56:04 +0000
Subject: [PATCH 43/53] User test

---
 .azuredevops/components/ROCR-Runtime.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/components/ROCR-Runtime.yml b/.azuredevops/components/ROCR-Runtime.yml
index cdf935e2a..3482c0a18 100644
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -8,7 +8,7 @@ parameters:
 - name: checkoutRef
   type: string
   default: ''
-# monorepo related parameters
+# monorepo related parameters test
 - name: sparseCheckoutDir
   type: string
   default: ''

From c4b4abe3543ab36e23777d1a9bb9f6d6144a2c31 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Wed, 10 Sep 2025 19:58:20 +0000
Subject: [PATCH 44/53] User Test Commit

---
 .azuredevops/components/ROCR-Runtime.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/components/ROCR-Runtime.yml b/.azuredevops/components/ROCR-Runtime.yml
index 3482c0a18..cdf935e2a 100644
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -8,7 +8,7 @@ parameters:
 - name: checkoutRef
   type: string
   default: ''
-# monorepo related parameters test
+# monorepo related parameters
 - name: sparseCheckoutDir
   type: string
   default: ''

From 2383edc1fee1faa80bf94bd60f6151d68ce32e97 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Thu, 11 Sep 2025 18:12:55 +0000
Subject: [PATCH 45/53] Fixed WorkingDir in TestJobs

---
 .azuredevops/components/ROCR-Runtime.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.azuredevops/components/ROCR-Runtime.yml b/.azuredevops/components/ROCR-Runtime.yml
index cdf935e2a..098efb5aa 100644
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -166,7 +166,7 @@ jobs:
         displayName: Build kfdtest
         inputs:
           targetType: 'inline'
-          workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest
+          workingDirectory: $(Agent.BuildDirectory)/s/libhsakmt/tests/kfdtest
           script: |
             if [ -e /opt/rh/gcc-toolset-14/enable ]; then
               source /opt/rh/gcc-toolset-14/enable
@@ -177,17 +177,17 @@ jobs:
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
         parameters:
           componentName: kfdtest
-          testExecutable: BIN_DIR=$(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/build ./run_kfdtest.sh
+          testExecutable: BIN_DIR=$(Agent.BuildDirectory)/s/libhsakmt/tests/kfdtest/build ./run_kfdtest.sh
           testParameters: '-p core --gtest_output=xml:./test_output.xml --gtest_color=yes'
-          testDir: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/scripts
+          testDir: $(Agent.BuildDirectory)/s/libhsakmt/tests/kfdtest/scripts
           os: ${{ job.os }}
       - task: Bash@3
         displayName: Build rocrtst
         inputs:
           targetType: 'inline'
-          workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common
+          workingDirectory: $(Agent.BuildDirectory)/s/rocrtst/suites/test_common
           script: |
-            echo $(Build.SourcesDirectory)/rocrtst/thirdparty/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
+            echo $(Agent.BuildDirectory)/s/rocrtst/thirdparty/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
             sudo cat /etc/ld.so.conf.d/rocm-ci.conf
             sudo ldconfig -v
             ldconfig -p

From 957005f596cc55e99597e53f3900801cf56102f4 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Thu, 11 Sep 2025 18:56:55 +0000
Subject: [PATCH 46/53] Updated rocrtst testDir

---
 .azuredevops/components/ROCR-Runtime.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/components/ROCR-Runtime.yml b/.azuredevops/components/ROCR-Runtime.yml
index 098efb5aa..fea5ae3d0 100644
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -210,7 +210,7 @@ jobs:
           componentName: rocrtst
           testExecutable: ./rocrtst64
           testParameters: '--gtest_filter="-rocrtstNeg.Memory_Negative_Tests:rocrtstFunc.Memory_Max_Mem" --gtest_output=xml:./test_output.xml --gtest_color=yes'
-          testDir: $(Build.SourcesDirectory)/rocrtst/suites/test_common/build/${{ job.target }}
+          testDir: $(Agent.BuildDirectory)/s/rocrtst/suites/test_common/build/${{ job.target }}
           os: ${{ job.os }}
       - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
         parameters:

From 8c1df97e34836783d27d6910e39af3263252aa28 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Wed, 10 Sep 2025 22:40:35 +0000
Subject: [PATCH 47/53] [Ex CI] Enable rocprofiler-sdk monorepo

---
 .azuredevops/components/rocprofiler-sdk.yml | 39 +++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/.azuredevops/components/rocprofiler-sdk.yml b/.azuredevops/components/rocprofiler-sdk.yml
index 7dea99f0e..7ccfd39db 100644
--- a/.azuredevops/components/rocprofiler-sdk.yml
+++ b/.azuredevops/components/rocprofiler-sdk.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocprofiler-sdk
 - name: checkoutRepo
   type: string
   default: 'self'
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -73,6 +92,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
   - job: rocprofiler_sdk_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.target }}
     variables:
     - group: common
     - template: /.azuredevops/variables-global.yml
@@ -89,6 +112,7 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
       parameters:
@@ -96,6 +120,8 @@ jobs:
         dependencyList: ${{ parameters.rocmDependencies }}
         gpuTarget: ${{ job.target }}
         aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
     - task: Bash@3
       displayName: Add Python site-packages binaries to path
       inputs:
@@ -105,6 +131,7 @@ jobs:
           echo "##vso[task.prependpath]$USER_BASE/bin"
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         extraBuildFlags: >-
           -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
           -DROCPROFILER_BUILD_TESTS=ON
@@ -114,9 +141,12 @@ jobs:
           -GNinja
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
         gpuTarget: ${{ job.target }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         gpuTarget: ${{ job.target }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
     # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -126,13 +156,14 @@ jobs:
     #     gpuTarget: ${{ job.target }}
     #     registerROCmPackages: true
 
+- ${{ if eq(parameters.unifiedBuild, False) }}:
 - ${{ each job in parameters.jobMatrix.testJobs }}:
   - job: rocprofiler_sdk_test_${{ job.target }}
     dependsOn: rocprofiler_sdk_build_${{ job.target }}
     condition:
       and(succeeded(),
         eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
         eq(${{ parameters.aggregatePipeline }}, False)
       )
     variables:
@@ -150,6 +181,7 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
         checkoutRepo: ${{ parameters.checkoutRepo }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
@@ -157,6 +189,8 @@ jobs:
         checkoutRef: ${{ parameters.checkoutRef }}
         dependencyList: ${{ parameters.rocmDependencies }}
         gpuTarget: ${{ job.target }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
     - task: Bash@3
       displayName: Add Python and ROCm binaries to path
       inputs:
@@ -167,6 +201,7 @@ jobs:
           echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         extraBuildFlags: >-
           -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
           -DROCPROFILER_BUILD_TESTS=ON
@@ -177,7 +212,7 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH}}/steps/gpu-diagnostics.yml
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
       parameters:
-        componentName: rocprofiler-sdk
+        componentName: ${{ parameters.componentName }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
       parameters:
         aptPackages: ${{ parameters.aptPackages }}

From e71b8212f9046f1ba096b80a32aaacbdb91fa30e Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Wed, 10 Sep 2025 22:42:02 +0000
Subject: [PATCH 48/53] Fixed Indentation

---
 .azuredevops/components/rocprofiler-sdk.yml | 126 ++++++++++----------
 1 file changed, 63 insertions(+), 63 deletions(-)

diff --git a/.azuredevops/components/rocprofiler-sdk.yml b/.azuredevops/components/rocprofiler-sdk.yml
index 7ccfd39db..3f1656040 100644
--- a/.azuredevops/components/rocprofiler-sdk.yml
+++ b/.azuredevops/components/rocprofiler-sdk.yml
@@ -157,66 +157,66 @@ jobs:
     #     registerROCmPackages: true
 
 - ${{ if eq(parameters.unifiedBuild, False) }}:
-- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocprofiler_sdk_test_${{ job.target }}
-    dependsOn: rocprofiler_sdk_build_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        registerROCmPackages: true
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
-        checkoutRepo: ${{ parameters.checkoutRepo }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependencies }}
-        gpuTarget: ${{ job.target }}
-        ${{ if parameters.triggerDownstreamJobs }}:
-            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
-    - task: Bash@3
-      displayName: Add Python and ROCm binaries to path
-      inputs:
-        targetType: inline
-        script: |
-          USER_BASE=$(python3 -m site --user-base)
-          echo "##vso[task.prependpath]$USER_BASE/bin"
-          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        componentName: ${{ parameters.componentName }}
-        extraBuildFlags: >-
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-          -DROCPROFILER_BUILD_TESTS=ON
-          -DROCPROFILER_BUILD_SAMPLES=ON
-          -DROCPROFILER_BUILD_RELEASE=ON
-          -DGPU_TARGETS=${{ job.target }}
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH}}/steps/gpu-diagnostics.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: ${{ parameters.componentName }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        environment: test
-        gpuTarget: ${{ job.target }}
-        registerROCmPackages: true
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: rocprofiler_sdk_test_${{ job.target }}
+      dependsOn: rocprofiler_sdk_build_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          registerROCmPackages: true
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        parameters:
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmDependencies }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - task: Bash@3
+        displayName: Add Python and ROCm binaries to path
+        inputs:
+          targetType: inline
+          script: |
+            USER_BASE=$(python3 -m site --user-base)
+            echo "##vso[task.prependpath]$USER_BASE/bin"
+            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          extraBuildFlags: >-
+            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+            -DROCPROFILER_BUILD_TESTS=ON
+            -DROCPROFILER_BUILD_SAMPLES=ON
+            -DROCPROFILER_BUILD_RELEASE=ON
+            -DGPU_TARGETS=${{ job.target }}
+            -GNinja
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+          registerROCmPackages: true

From c9c41a34c204359b7389a785a20c035ae2c92040 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Thu, 11 Sep 2025 20:11:31 +0000
Subject: [PATCH 49/53] [Ex CI] enable hip-tests monorepo

---
 .azuredevops/components/hip-tests.yml | 133 ++++++++++++++++----------
 1 file changed, 83 insertions(+), 50 deletions(-)

diff --git a/.azuredevops/components/hip-tests.yml b/.azuredevops/components/hip-tests.yml
index c88465a6d..388ac4170 100644
--- a/.azuredevops/components/hip-tests.yml
+++ b/.azuredevops/components/hip-tests.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: hip-tests
 - name: checkoutRepo
   type: string
   default: 'self'
 - name: checkoutRef
   type: string
   default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -60,6 +79,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
   - job: hip_tests_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.target }}
     variables:
     - group: common
     - template: /.azuredevops/variables-global.yml
@@ -76,15 +99,18 @@ jobs:
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
       parameters:
         checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
       parameters:
         checkoutRef: ${{ parameters.checkoutRef }}
         dependencyList: ${{ parameters.rocmDependencies }}
         aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
     # compile hip-tests
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
       parameters:
-        componentName: hip-tests
+        componentName: ${{ parameters.componentName }}
         cmakeSourceDir: '../catch'
         customBuildTarget: build_tests
         extraBuildFlags: >-
@@ -96,9 +122,12 @@ jobs:
           -GNinja
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
         gpuTarget: ${{ job.target }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
       parameters:
+        componentName: ${{ parameters.componentName }}
         gpuTarget: ${{ job.target }}
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
     - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -108,52 +137,56 @@ jobs:
         extraEnvVars:
           - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
 
-- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: hip_tests_test_${{ job.target }}
-    timeoutInMinutes: 240
-    dependsOn: hip_tests_build_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-    - task: Bash@3
-      displayName: Symlink rocm_agent_enumerator
-      inputs:
-        targetType: inline
-        script: |
-          # Assuming that /opt is no longer persistent across runs, test environments are fully ephemeral
-          sudo mkdir -p /opt/rocm/bin
-          sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rocm_agent_enumerator /opt/rocm/bin/rocm_agent_enumerator
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: hip_tests
-        testDir: $(Agent.BuildDirectory)/rocm/share/hip
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        environment: test
-        gpuTarget: ${{ job.target }}
-        optSymLink: true
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: hip_tests_test_${{ job.target }}
+      timeoutInMinutes: 240
+      dependsOn: hip_tests_build_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - checkout: none
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - task: Bash@3
+        displayName: Symlink rocm_agent_enumerator
+        inputs:
+          targetType: inline
+          script: |
+            # Assuming that /opt is no longer persistent across runs, test environments are fully ephemeral
+            sudo mkdir -p /opt/rocm/bin
+            sudo ln -s $(Agent.BuildDirectory)/rocm/bin/rocm_agent_enumerator /opt/rocm/bin/rocm_agent_enumerator
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: $(Agent.BuildDirectory)/rocm/share/hip
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+          optSymLink: true

From 17be0ce7aadecbbe326bca9871aaa04094715311 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Thu, 11 Sep 2025 16:33:27 -0400
Subject: [PATCH 50/53] [Ex CI] Update pipeline Id for rocprofiler-sdk to
 monorepo

---
 .azuredevops/templates/steps/dependencies-rocm.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.azuredevops/templates/steps/dependencies-rocm.yml b/.azuredevops/templates/steps/dependencies-rocm.yml
index e6e6db966..3db025e8c 100644
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -215,8 +215,8 @@ parameters:
       developBranch: develop
       hasGpuTarget: false
     rocprofiler-sdk:
-      pipelineId: 246
-      developBranch: amd-staging
+      pipelineId: 347
+      developBranch: develop
       hasGpuTarget: true
     rocprofiler-systems:
       pipelineId: 255

From 9a3fc8c773a0b7249d596c77a8970c1e3f8cd924 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Wed, 10 Sep 2025 17:55:19 -0400
Subject: [PATCH 51/53] [Ex CI] Update pipeline Id for rocm-smi-lib to monorepo

---
 .azuredevops/templates/steps/dependencies-rocm.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.azuredevops/templates/steps/dependencies-rocm.yml b/.azuredevops/templates/steps/dependencies-rocm.yml
index 3db025e8c..8f846b2ac 100644
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -195,8 +195,8 @@ parameters:
       developBranch: master
       hasGpuTarget: false
     rocm_smi_lib:
-      pipelineId: 96
-      developBranch: amd-staging
+      pipelineId: 358
+      developBranch: develop
       hasGpuTarget: false
     rocPRIM:
       pipelineId: 273

From b3c566f6b984ba3ba1c254fffde51c893d857665 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Thu, 11 Sep 2025 16:50:49 -0400
Subject: [PATCH 52/53] [Ex CI] Update pipeline Id for hip-tests to monorepo

---
 .azuredevops/templates/steps/dependencies-rocm.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.azuredevops/templates/steps/dependencies-rocm.yml b/.azuredevops/templates/steps/dependencies-rocm.yml
index 8f846b2ac..0b46ed37f 100644
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -63,8 +63,8 @@ parameters:
       developBranch: develop
       hasGpuTarget: false
     hip-tests:
-      pipelineId: 233
-      developBranch: amd-staging
+      pipelineId: 362
+      developBranch: develop
       hasGpuTarget: false
     hipBLAS:
       pipelineId: 317

From 355feae2e2f9da98e2c139432c224d7d837db281 Mon Sep 17 00:00:00 2001
From: amd-hsivasun <hsivasun@amd.com>
Date: Thu, 11 Sep 2025 16:37:00 -0400
Subject: [PATCH 53/53] [Ex CI] Update pipeline Id for rocr-runtime to monorepo

---
 .azuredevops/templates/steps/dependencies-rocm.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.azuredevops/templates/steps/dependencies-rocm.yml b/.azuredevops/templates/steps/dependencies-rocm.yml
index 0b46ed37f..5fbb57bb5 100644
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -227,8 +227,8 @@ parameters:
       developBranch: develop
       hasGpuTarget: true
     ROCR-Runtime:
-      pipelineId: 10
-      developBranch: amd-staging
+      pipelineId: 354
+      developBranch: develop
       hasGpuTarget: false
     rocRAND:
       pipelineId: 274