Merge branch 'develop' into develop-internal

2026-01-09 06:38:00 -05:00 · 2025-09-16 05:12:42 -04:00
parent b800801427 b07ae4ba6c
commit 60e3a8107c
89 changed files with 7608 additions and 1892 deletions
--- a/.azuredevops/components/HIP.yml
+++ b/.azuredevops/components/HIP.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: hip_clr_combined
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -35,93 +54,24 @@ parameters:
  type: object
  default:
    - llvm-project
-
-# hip and clr are tightly-coupled
-# run this same template for both repos
-# any changes for clr should just trigger HIP pipeline
-# similarly for hipother repo, for Nvidia backend
+    - ROCR-Runtime

 - name: jobMatrix
  type: object
  default:
    buildJobs:
-      - { os: ubuntu2204, packageManager: apt }
-      - { os: almalinux8, packageManager: dnf }
+      - { os: ubuntu2204, packageManager: apt, platform: amd }
+      - { os: ubuntu2204, packageManager: apt, platform: nvidia }
+      - { os: almalinux8, packageManager: dnf, platform: amd }
+      - { os: almalinux8, packageManager: dnf, platform: nvidia }

-# HIP with AMD backend
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: hip_clr_combined_${{ job.os }}_amd
-    pool:
-      vmImage: 'ubuntu-22.04'
-    ${{ if eq(job.os, 'almalinux8') }}:
-      container:
-        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-        endpoint: ContainerService3
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        packageManager: ${{ job.packageManager }}
-  # checkout triggering repo (either HIP or clr)
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: ${{ parameters.checkoutRepo }}
-  # if this is triggered by HIP repo, matching repo is clr
-  # if this is triggered by clr repo, matching repo is HIP
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: matching_repo
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: hipother_repo
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependenciesAMD }}
-        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        os: ${{ job.os }}
-  # compile clr
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        componentName: clr
-        cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
-        cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
-        os: ${{ job.os }}
-        useAmdclang: false
-        extraBuildFlags: >-
-          -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
-          -DHIP_PLATFORM=amd
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
-          -DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
-          -DCLR_BUILD_HIP=ON
-          -DCLR_BUILD_OCL=ON
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
-      parameters:
-        artifactName: amd
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-      parameters:
-        artifactName: amd
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-    #   parameters:
-    #     aptPackages: ${{ parameters.aptPackages }}
-    #     pipModules: ${{ parameters.pipModules }}
-    #     environment: amd
-
-# HIP with Nvidia backend
- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: hip_clr_combined_${{ job.os }}_nvidia
+  - job: ${{ parameters.componentName }}_${{ job.os }}_${{ job.platform }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
    pool:
      vmImage: 'ubuntu-22.04'
    ${{ if eq(job.os, 'almalinux8') }}:
@@ -140,49 +90,45 @@ jobs:
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-  # checkout triggering repo (either HIP or clr)
+    # full checkout of rocm-systems superrepo, we need clr, hip, and hipother
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-  # if this is triggered by HIP repo, matching repo is clr
-  # if this is triggered by clr repo, matching repo is HIP
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: matching_repo
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: hipother_repo
+        # sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependenciesNvidia }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
-    - script: 'ls -1R $(Agent.BuildDirectory)/rocm'
-      displayName: 'Artifact listing'
-  # compile clr
+        ${{ if eq(job.platform, 'amd') }}:
+          dependencyList: ${{ parameters.rocmDependenciesAMD }}
+        ${{ elseif eq(job.platform, 'nvidia') }}:
+          dependencyList: ${{ parameters.rocmDependenciesNvidia }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        componentName: clr
-        cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
-        cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
+        cmakeBuildDir: $(Agent.BuildDirectory)/s/projects/clr/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/s/projects/clr
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
-          -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
-          -DHIP_PLATFORM=nvidia
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
+          -DHIP_COMMON_DIR=$(Agent.BuildDirectory)/s/projects/hip
+          -DHIPNV_DIR=$(Agent.BuildDirectory)/s/projects/hipother/hipnv
+          -DHIP_PLATFORM=${{ job.platform }}
          -DCLR_BUILD_HIP=ON
-          -DCLR_BUILD_OCL=OFF
-          -DHIPNV_DIR=$(Build.SourcesDirectory)/hipother/hipnv
+          -DCLR_BUILD_OCL=ON
          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+      parameters:
+        artifactName: ${{ job.platform }}
+        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
-        artifactName: nvidia
+        artifactName: ${{ job.platform }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-    #   parameters:
-    #     aptPackages: ${{ parameters.aptPackages }}
-    #     pipModules: ${{ parameters.pipModules }}
-    #     environment: nvidia
--- a/.azuredevops/components/MIOpen.yml
+++ b/.azuredevops/components/MIOpen.yml
@@ -131,6 +131,7 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -149,6 +150,7 @@ jobs:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - task: Bash@3
      displayName: Build and install other dependencies
+      retryCountOnTaskFailure: 3
      inputs:
        targetType: inline
        workingDirectory: $(Agent.BuildDirectory)/s
@@ -210,6 +212,7 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -228,6 +231,7 @@ jobs:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - task: Bash@3
      displayName: Build and install other dependencies
+      retryCountOnTaskFailure: 3
      inputs:
        targetType: inline
        workingDirectory: $(Agent.BuildDirectory)/s
--- a/.azuredevops/components/ROCR-Runtime.yml
+++ b/.azuredevops/components/ROCR-Runtime.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: ROCR-Runtime
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -45,6 +64,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ROCR_Runtime_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
    pool:
      vmImage: 'ubuntu-22.04'
    ${{ if eq(job.os, 'almalinux8') }}:
@@ -65,14 +88,18 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -82,22 +109,26 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: ROCR_Runtime_test_${{ job.os }}_${{ job.target }}
      dependsOn: ROCR_Runtime_build_${{ job.os }}
      condition:
        and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
          eq(${{ parameters.aggregatePipeline }}, False)
        )
      variables:
@@ -122,9 +153,12 @@ jobs:
          dependencyList: ${{ parameters.rocmTestDependencies }}
          gpuTarget: ${{ job.target }}
          os: ${{ job.os }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
        parameters:
          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
        parameters:
          runRocminfo: false
@@ -132,7 +166,7 @@ jobs:
        displayName: Build kfdtest
        inputs:
          targetType: 'inline'
-        workingDirectory: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest
+          workingDirectory: $(Agent.BuildDirectory)/s/libhsakmt/tests/kfdtest
          script: |
            if [ -e /opt/rh/gcc-toolset-14/enable ]; then
              source /opt/rh/gcc-toolset-14/enable
@@ -143,17 +177,17 @@ jobs:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
          componentName: kfdtest
-        testExecutable: BIN_DIR=$(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/build ./run_kfdtest.sh
+          testExecutable: BIN_DIR=$(Agent.BuildDirectory)/s/libhsakmt/tests/kfdtest/build ./run_kfdtest.sh
          testParameters: '-p core --gtest_output=xml:./test_output.xml --gtest_color=yes'
-        testDir: $(Build.SourcesDirectory)/libhsakmt/tests/kfdtest/scripts
+          testDir: $(Agent.BuildDirectory)/s/libhsakmt/tests/kfdtest/scripts
          os: ${{ job.os }}
      - task: Bash@3
        displayName: Build rocrtst
        inputs:
          targetType: 'inline'
-        workingDirectory: $(Build.SourcesDirectory)/rocrtst/suites/test_common
+          workingDirectory: $(Agent.BuildDirectory)/s/rocrtst/suites/test_common
          script: |
-          echo $(Build.SourcesDirectory)/rocrtst/thirdparty/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
+            echo $(Agent.BuildDirectory)/s/rocrtst/thirdparty/lib | sudo tee -a /etc/ld.so.conf.d/rocm-ci.conf
            sudo cat /etc/ld.so.conf.d/rocm-ci.conf
            sudo ldconfig -v
            ldconfig -p
@@ -176,7 +210,7 @@ jobs:
          componentName: rocrtst
          testExecutable: ./rocrtst64
          testParameters: '--gtest_filter="-rocrtstNeg.Memory_Negative_Tests:rocrtstFunc.Memory_Max_Mem" --gtest_output=xml:./test_output.xml --gtest_color=yes'
-        testDir: $(Build.SourcesDirectory)/rocrtst/suites/test_common/build/${{ job.target }}
+          testDir: $(Agent.BuildDirectory)/s/rocrtst/suites/test_common/build/${{ job.target }}
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
--- a/.azuredevops/components/Tensile.yml
+++ b/.azuredevops/components/Tensile.yml
@@ -171,6 +171,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - task: DownloadPipelineArtifact@2
      displayName: 'Download Pipeline Wheel Files'
+      retryCountOnTaskFailure: 3
      inputs:
        itemPattern: '**/*${{ job.os }}*.whl'
        targetPath: $(Agent.BuildDirectory)
--- a/.azuredevops/components/hip-tests.yml
+++ b/.azuredevops/components/hip-tests.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: hip-tests
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -60,6 +79,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: hip_tests_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -76,15 +99,18 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    # compile hip-tests
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
-        componentName: hip-tests
+        componentName: ${{ parameters.componentName }}
        cmakeSourceDir: '../catch'
        customBuildTarget: build_tests
        extraBuildFlags: >-
@@ -96,9 +122,12 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -108,14 +137,15 @@ jobs:
        extraEnvVars:
          - HIP_ROCCLR_HOME:::/home/user/workspace/rocm

- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: hip_tests_test_${{ job.target }}
      timeoutInMinutes: 240
      dependsOn: hip_tests_build_${{ job.target }}
      condition:
        and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
          eq(${{ parameters.aggregatePipeline }}, False)
        )
      variables:
@@ -125,6 +155,7 @@ jobs:
      workspace:
        clean: all
      steps:
+      - checkout: none
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
@@ -138,6 +169,8 @@ jobs:
          checkoutRef: ${{ parameters.checkoutRef }}
          dependencyList: ${{ parameters.rocmTestDependencies }}
          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - task: Bash@3
        displayName: Symlink rocm_agent_enumerator
        inputs:
@@ -149,7 +182,7 @@ jobs:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
-        componentName: hip_tests
+          componentName: ${{ parameters.componentName }}
          testDir: $(Agent.BuildDirectory)/rocm/share/hip
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
--- a/.azuredevops/components/hipBLASLt.yml
+++ b/.azuredevops/components/hipBLASLt.yml
@@ -35,6 +35,8 @@ parameters:
    - ccache
    - gfortran
    - git
+    - libboost-filesystem-dev
+    - libboost-program-options-dev
    - libdrm-dev
    - liblapack-dev
    - libmsgpack-dev
@@ -176,7 +178,7 @@ jobs:
          mkdir -p $(Agent.BuildDirectory)/temp-deps
          cd $(Agent.BuildDirectory)/temp-deps
          # position-independent LAPACK is required for almalinux8 builds
-          cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
+          cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/sparse/projects/hipblaslt/deps
          make -j
          sudo make install
    - script: |
@@ -195,6 +197,8 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
+        cmakeSourceDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt
+        cmakeBuildDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt/build
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
          -DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
--- a/.azuredevops/components/hipSPARSELt.yml
+++ b/.azuredevops/components/hipSPARSELt.yml
@@ -44,6 +44,7 @@ parameters:
  type: object
  default:
    - joblib
+    - msgpack
 - name: rocmDependencies
  type: object
  default:
@@ -158,6 +159,7 @@ jobs:
          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DBUILD_CLIENTS_TESTS=ON
+          -DBUILD_USE_LOCAL_TENSILE=OFF
          -GNinja
        ${{ if ne(parameters.sparseCheckoutDir, '') }}:
          cmakeSourceDir: $(Build.SourcesDirectory)/projects/hipsparselt
--- a/.azuredevops/components/origami.yml
+++ b/.azuredevops/components/origami.yml
@@ -0,0 +1,236 @@
+parameters:
+- name: componentName  # used in artifact names and in the DISABLED_*_TESTS lookup
+  type: string
+  default: origami
+- name: checkoutRepo  # forwarded to steps/checkout.yml
+  type: string
+  default: 'self'
+- name: checkoutRef  # forwarded to steps/dependencies-rocm.yml
+  type: string
+  default: ''
+# monorepo related parameters; this template forwards the same set to its downstream components
+- name: sparseCheckoutDir  # forwarded to steps/checkout.yml and steps/manifest.yml
+  type: string
+  default: ''
+- name: triggerDownstreamJobs  # when true, downstreamComponentMatrix templates are expanded
+  type: boolean
+  default: false
+- name: downstreamAggregateNames  # only forwarded when triggerDownstreamJobs is true
+  type: string
+  default: ''
+- name: buildDependsOn  # job-name prefixes; '_<os>' is appended to form dependsOn
+  type: object
+  default: null
+- name: unifiedBuild  # when true, this template generates no test jobs
+  type: boolean
+  default: false
+# set to true if doing a full build of the ROCm stack
+# and dependencies are pulled from the same pipeline
+- name: aggregatePipeline
+  type: boolean
+  default: false
+- name: aptPackages  # forwarded to dependencies-other.yml and docker-container.yml
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+    - wget
+    - python3
+    - python3-dev
+    - python3-pip
+- name: pipModules  # forwarded to dependencies-other.yml and docker-container.yml
+  type: object
+  default:
+    - nanobind>=2.0.0
+- name: rocmDependencies  # dependencyList for the build jobs
+  type: object
+  default:
+    - clr
+    - llvm-project
+    - rocm-cmake
+    - rocminfo
+    - ROCR-Runtime
+    - rocprofiler-register
+- name: rocmTestDependencies  # dependencyList for the test jobs
+  type: object
+  default:
+    - clr
+    - llvm-project
+    - rocm-cmake
+    - rocminfo
+    - ROCR-Runtime
+    - rocprofiler-register
+
+- name: jobMatrix
+  type: object
+  default:
+    buildJobs:  # one origami_build_<os> job per entry
+      - { os: ubuntu2204, packageManager: apt }
+      - { os: almalinux8, packageManager: dnf }
+    testJobs:  # one origami_test_<os>_<target> job per entry; target also names the pool
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+- name: downstreamComponentMatrix  # components expanded after origami when triggerDownstreamJobs is true
+  type: object
+  default:
+    - hipBLASLt:
+      name: hipBLASLt  # selects /.azuredevops/components/<name>.yml
+      sparseCheckoutDir: projects/hipblaslt
+      skipUnifiedBuild: 'false'  # string, compared against 'true' when filtering
+      buildDependsOn:
+        - origami_build
+
+jobs:  # build jobs come first: one per jobMatrix.buildJobs entry
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: origami_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:  # every upstream job prefix gets this job's OS suffix
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    - name: ROCM_PATH
+      value: $(Agent.BuildDirectory)/rocm
+    pool:
+      vmImage: ${{ variables.BASE_BUILD_POOL }}
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:  # almalinux8 builds run inside the manylinux228 image
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        pipModules: ${{ parameters.pipModules }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+      parameters:
+        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmDependencies }}
+        os: ${{ job.os }}
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        extraBuildFlags: >-
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+          -DORIGAMI_BUILD_SHARED_LIBS=ON
+          -DORIGAMI_ENABLE_PYTHON=ON
+          -DORIGAMI_BUILD_TESTING=ON
+          -GNinja
+    - ${{ if ne(job.os, 'almalinux8') }}:
+      - task: PublishPipelineArtifact@1  # downloaded again by the test jobs under the same artifact name
+        displayName: 'Publish Build Directory Artifact'
+        inputs:
+          targetPath: '$(Agent.BuildDirectory)/s/build'
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
+          publishLocation: 'pipeline'
+      - task: PublishPipelineArtifact@1  # python sources holding the scripts the test jobs execute
+        displayName: 'Publish Python Source Artifact'
+        inputs:
+          targetPath: '$(Agent.BuildDirectory)/s/python'
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
+          publishLocation: 'pipeline'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
+        componentName: ${{ parameters.componentName }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+
+- ${{ if eq(parameters.unifiedBuild, False) }}:  # test jobs are omitted entirely for unified builds
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: origami_test_${{ job.os }}_${{ job.target }}
+      timeoutInMinutes: 120
+      dependsOn: origami_build_${{ job.os }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool  # pool name is derived from the GPU target
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          packageManager: ${{ job.packageManager }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
+          os: ${{ job.os }}
+      - task: DownloadPipelineArtifact@2  # artifact name matches the one published by the build job
+        displayName: 'Download Build Directory Artifact'
+        inputs:
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
+          path: '$(Agent.BuildDirectory)/s/build'
+      - task: DownloadPipelineArtifact@2  # artifact name matches the one published by the build job
+        displayName: 'Download Python Source Artifact'
+        inputs:
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
+          path: '$(Agent.BuildDirectory)/s/python'
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          os: ${{ job.os }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - script: |
+          set -e  # without this only the last command's exit code decides the step result
+          export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH
+          echo "--- Running origami_test.py ---"
+          python3 $(Agent.BuildDirectory)/s/python/origami_test.py
+
+          echo "--- Running origami_grid_test.py ---"
+          python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py
+        displayName: 'Run Python Binding Tests'
+        condition: succeeded()
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+
+- ${{ if parameters.triggerDownstreamJobs }}:  # downstream components are expanded only on request
+  - ${{ each component in parameters.downstreamComponentMatrix }}:
+    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:  # skipped only when unified build AND the component opts out
+      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+          buildDependsOn: ${{ component.buildDependsOn }}
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}  # append this component to the '+'-joined chain
+          triggerDownstreamJobs: true  # always enabled for chained components
+          unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/rocBLAS.yml
+++ b/.azuredevops/components/rocBLAS.yml
@@ -115,6 +115,13 @@ parameters:
 #        buildDependsOn:
 #          - rocBLAS_build
 #          - rocPRIM_build
+    # temporary rocblas->hipblas downstream path while the SOLVERs are disabled
+    - hipBLAS:
+      name: hipBLAS  # presumably selects the hipBLAS component template; confirm against the downstream loop
+      sparseCheckoutDir: projects/hipblas
+      skipUnifiedBuild: 'false'  # kept as a quoted string, not a boolean
+      buildDependsOn:
+        - rocBLAS_build  # NOTE(review): consumers elsewhere in this change append _<os> to this name; confirm

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
@@ -172,6 +179,8 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
+        cmakeSourceDir: $(Agent.BuildDirectory)/sparse/projects/rocblas
+        cmakeBuildDir: $(Agent.BuildDirectory)/sparse/projects/rocblas/build
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm/llvm;$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
          -DCMAKE_BUILD_TYPE=Release
--- a/.azuredevops/components/rocDecode.yml
+++ b/.azuredevops/components/rocDecode.yml
@@ -8,6 +8,25 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+- name: rocPyDecodeRepo  # passed as checkoutRepo to the downstream component templates
+  type: string
+  default: rocpydecode_repo
+# monorepo related parameters
+- name: sparseCheckoutDir  # forwarded to steps/checkout.yml
+  type: string
+  default: ''
+- name: triggerDownstreamJobs  # when true, the downstreamComponentMatrix templates are expanded
+  type: boolean
+  default: false
+- name: downstreamAggregateNames  # '+'-separated chain; this component's name is appended before it is passed downstream
+  type: string
+  default: ''
+- name: buildDependsOn  # job-name prefixes; each becomes a dependsOn entry suffixed with _<job.os>
+  type: object
+  default: null
+- name: unifiedBuild  # when true, downstream entries with skipUnifiedBuild: 'true' are skipped
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -56,10 +75,23 @@ parameters:
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+- name: downstreamComponentMatrix  # components expanded after this one when triggerDownstreamJobs is true
+  type: object
+  default:
+    - rocPyDecode:
+      name: rocPyDecode  # selects /.azuredevops/components/<name>.yml in the downstream loop
+      sparseCheckoutDir: ''
+      skipUnifiedBuild: 'false'  # string; compared against 'true' in the downstream loop
+      buildDependsOn:
+        - rocDecode_build  # presumably suffixed with _<os> by the downstream template to name the job to wait for

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:  # null (the default) emits no dependsOn key
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}  # wait for the upstream build job of the same OS
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -83,12 +115,15 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:  # pass the aggregate chain to dependencies-rocm only for downstream-triggered runs
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
@@ -169,3 +204,15 @@ jobs:
        registerROCmPackages: true
        environment: test
        gpuTarget: ${{ job.target }}
+
+- ${{ if parameters.triggerDownstreamJobs }}:  # expand downstream component templates only when requested
+  - ${{ each component in parameters.downstreamComponentMatrix }}:
+    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:  # in unified builds, skip entries flagged skipUnifiedBuild: 'true'
+      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+        parameters:
+          checkoutRepo: ${{ parameters.rocPyDecodeRepo }}  # downstream checks out the rocPyDecode repo resource, not this template's checkoutRepo
+          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+          buildDependsOn: ${{ component.buildDependsOn }}
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}  # append this component to the '+'-separated chain
+          triggerDownstreamJobs: true
+          unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/rocPyDecode.yml
+++ b/.azuredevops/components/rocPyDecode.yml
@@ -5,6 +5,22 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir  # forwarded to steps/checkout.yml
+  type: string
+  default: ''
+- name: triggerDownstreamJobs  # when true, downstreamAggregateNames is passed to steps/dependencies-rocm.yml
+  type: boolean
+  default: false
+- name: downstreamAggregateNames  # value handed to dependencies-rocm; callers in this change build it as a '+'-separated chain
+  type: string
+  default: ''
+- name: buildDependsOn  # job-name prefixes; each becomes a dependsOn entry suffixed with _<job.os>
+  type: object
+  default: null
+- name: unifiedBuild  # not referenced in the hunks shown here; presumably accepted for a uniform caller interface -- confirm
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -47,19 +63,19 @@ parameters:
  type: object
  default:
    buildJobs:
-      - gfx942:
-        target: gfx942
-      - gfx90a:
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    testJobs:
-      - gfx942:
-        target: gfx942
-      - gfx90a:
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocPyDecode_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:  # null (the default) emits no dependsOn key
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}  # upstream job is keyed by OS, while this job's own name is keyed by GPU target
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -74,16 +90,20 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
+        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:  # pass the aggregate chain to dependencies-rocm only for downstream-triggered runs
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - task: Bash@3
      displayName: 'Save Python Package Paths'
      inputs:
@@ -190,6 +210,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - task: DownloadPipelineArtifact@2
      displayName: 'Download Pipeline Wheel Files'
+      retryCountOnTaskFailure: 3
      inputs:
        itemPattern: '**/*.whl'
        targetPath: $(Agent.BuildDirectory)
--- a/.azuredevops/components/rocm-core.yml
+++ b/.azuredevops/components/rocm-core.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName  # forwarded to build-cmake.yml, manifest.yml and artifact-upload.yml
+  type: string
+  default: rocm-core
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir  # forwarded to steps/checkout.yml and steps/manifest.yml
+  type: string
+  default: ''
+- name: triggerDownstreamJobs  # not referenced in the hunks shown here; presumably accepted for a uniform caller interface -- confirm
+  type: boolean
+  default: false
+- name: downstreamAggregateNames  # not referenced in the hunks shown here -- confirm
+  type: string
+  default: ''
+- name: buildDependsOn  # job-name prefixes; each becomes a dependsOn entry suffixed with _<job.os>
+  type: object
+  default: null
+- name: unifiedBuild  # not referenced in the hunks shown here -- confirm
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -27,6 +46,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocm_core_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:  # null (the default) emits no dependsOn key
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}  # wait for the upstream build job of the same OS
    pool:
      ${{ if eq(job.os, 'ubuntu2404') }}:
        vmImage: 'ubuntu-24.04'
@@ -50,8 +73,10 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -65,9 +90,12 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
--- a/.azuredevops/components/rocm-libraries.yml
+++ b/.azuredevops/components/rocm-libraries.yml
@@ -36,8 +36,10 @@ parameters:
    - gfortran
    - git
    - libdrm-dev
+    - liblapack-dev
    - libmsgpack-dev
    - libnuma-dev
+    - libopenblas-dev
    - ninja-build
    - python3-pip
    - python3-venv
@@ -46,6 +48,8 @@ parameters:
  default:
    - joblib
    - "packaging>=22.0"
+    - pytest
+    - pytest-cmake
    - --upgrade
 - name: rocmDependencies
  type: object
@@ -98,12 +102,12 @@ jobs:
    workspace:
      clean: all
    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -134,12 +138,26 @@ jobs:
          rocm-libraries | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING)
          rocm-libraries | ${{ job.os }} | ${{ job.target }}
          rocm-libraries | ${{ job.os }}
+    # Prepend the Python user-base bin dir to PATH and export <user-base>/share/Pytest/cmake as PytestCmakePath (used in CMAKE_PREFIX_PATH by the build step).
+    - task: Bash@3
+      displayName: Add paths for CMake and Python site-packages binaries
+      inputs:
+        targetType: inline
+        script: |
+          USER_BASE=$(python3 -m site --user-base)
+          echo "##vso[task.prependpath]$USER_BASE/bin"
+          echo "##vso[task.setvariable variable=PytestCmakePath]$USER_BASE/share/Pytest/cmake"
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        extraBuildFlags: >-
-          -DROCM_LIBRARIES_SUPERBUILD=ON
-          -GNinja
+          -D CMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor;$(PytestCmakePath)
+          -D CMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
+          -D CMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+          -D CMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
+          -D CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          -D CMAKE_C_COMPILER_LAUNCHER=ccache
+          -G Ninja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
--- a/.azuredevops/components/rocm_smi_lib.yml
+++ b/.azuredevops/components/rocm_smi_lib.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName  # forwarded to build/manifest/upload/test steps and matched against the DISABLED_<TARGET>_TESTS list
+  type: string
+  default: rocm_smi_lib
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir  # forwarded to steps/checkout.yml and steps/manifest.yml
+  type: string
+  default: ''
+- name: triggerDownstreamJobs  # not referenced in the hunks shown here; presumably accepted for a uniform caller interface -- confirm
+  type: boolean
+  default: false
+- name: downstreamAggregateNames  # not referenced in the hunks shown here -- confirm
+  type: string
+  default: ''
+- name: buildDependsOn  # job-name prefixes; each becomes a dependsOn entry suffixed with _<job.os>
+  type: object
+  default: null
+- name: unifiedBuild  # when true, the test jobs of this template are not emitted
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -32,6 +51,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocm_smi_lib_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:  # null (the default) emits no dependsOn key
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}  # wait for the upstream build job of the same OS
    pool:
      ${{ if eq(job.os, 'ubuntu2404') }}:
        vmImage: 'ubuntu-24.04'
@@ -55,8 +78,10 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -65,22 +90,26 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
    #   parameters:
    #     aptPackages: ${{ parameters.aptPackages }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:  # test jobs are emitted only when unifiedBuild is false
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: rocm_smi_lib_test_${{ job.os }}_${{ job.target }}
      dependsOn: rocm_smi_lib_build_${{ job.os }}
      condition:
        and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
          eq(${{ parameters.aggregatePipeline }}, False)
        )
      variables:
@@ -90,6 +119,7 @@ jobs:
      workspace:
        clean: all
      steps:
+      - checkout: none
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
@@ -103,7 +133,7 @@ jobs:
          runRocminfo: false
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
-        componentName: rocm_smi_lib
+          componentName: ${{ parameters.componentName }}
          testDir: '$(Agent.BuildDirectory)'
          testExecutable: 'sudo ./rocm/share/rocm_smi/rsmitst_tests/rsmitst'
          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
--- a/.azuredevops/components/rocminfo.yml
+++ b/.azuredevops/components/rocminfo.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName  # prefixes the build job name; also forwarded to build/manifest/upload/test steps and the DISABLED_<TARGET>_TESTS check
+  type: string
+  default: rocminfo
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir  # forwarded to steps/checkout.yml and steps/manifest.yml
+  type: string
+  default: ''
+- name: triggerDownstreamJobs  # when true, downstreamAggregateNames is passed to steps/dependencies-rocm.yml
+  type: boolean
+  default: false
+- name: downstreamAggregateNames  # value handed to dependencies-rocm; callers in this change build it as a '+'-separated chain
+  type: string
+  default: ''
+- name: buildDependsOn  # job-name prefixes; each becomes a dependsOn entry suffixed with _<job.os>
+  type: object
+  default: null
+- name: unifiedBuild  # when true, the test jobs of this template are not emitted
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -40,7 +59,11 @@ parameters:

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: rocminfo_build_${{ job.os }}
+  - job: ${{ parameters.componentName }}_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:  # null (the default) emits no dependsOn key
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}  # wait for the upstream build job of the same OS
    pool:
      vmImage: 'ubuntu-22.04'
    ${{ if eq(job.os, 'almalinux8') }}:
@@ -62,14 +85,18 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
@@ -78,19 +105,23 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml

- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:  # test jobs are emitted only when unifiedBuild is false
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: rocminfo_test_${{ job.target }}
-    dependsOn: rocminfo_build_${{ job.os }}
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}  # must track the build job name, which is now derived from componentName
      condition:
        and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
          eq(${{ parameters.aggregatePipeline }}, False)
        )
      variables:
@@ -115,12 +146,14 @@ jobs:
          dependencyList: ${{ parameters.rocmTestDependencies }}
          gpuTarget: ${{ job.target }}
          os: ${{ job.os }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
        parameters:
          runRocminfo: false
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
-        componentName: rocminfo
+          componentName: ${{ parameters.componentName }}
          testDir: '$(Agent.BuildDirectory)'
          testExecutable: './rocm/bin/rocminfo'
          testParameters: ''
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName  # forwarded to manifest/upload/artifact-download/test steps and matched against the DISABLED_<TARGET>_TESTS list
+  type: string
+  default: rocprofiler-compute
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir  # forwarded to steps/checkout.yml (build and test jobs) and steps/manifest.yml
+  type: string
+  default: ''
+- name: triggerDownstreamJobs  # when true, downstreamAggregateNames is passed to steps/dependencies-rocm.yml
+  type: boolean
+  default: false
+- name: downstreamAggregateNames  # value handed to dependencies-rocm; callers in this change build it as a '+'-separated chain
+  type: string
+  default: ''
+- name: buildDependsOn  # job-name prefixes; each becomes a dependsOn entry suffixed with _<job.os>_<job.target>
+  type: object
+  default: null
+- name: unifiedBuild  # when true, the test jobs of this template are not emitted
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -36,6 +55,7 @@ parameters:
    - pymongo
    - pyyaml
    - setuptools
+    - sqlalchemy
    - tabulate
    - textual
    - textual_plotext
@@ -78,6 +98,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocprofiler_compute_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:  # null (the default) emits no dependsOn key
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}_${{ job.target }}  # NOTE(review): build jobs in this file are named by target only; confirm upstream jobs are named <prefix>_<os>_<target>
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -94,15 +118,19 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        extraBuildFlags: >-
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -111,14 +139,15 @@ jobs:
    #     pipModules: ${{ parameters.pipModules }}
    #     gpuTarget: ${{ job.target }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:  # test jobs are emitted only when unifiedBuild is false
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: rocprofiler_compute_test_${{ job.target }}
      timeoutInMinutes: 120
      dependsOn: rocprofiler_compute_build_${{ job.target }}
      condition:
        and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
          eq(${{ parameters.aggregatePipeline }}, False)
        )
      variables:
@@ -138,8 +167,10 @@ jobs:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
        parameters:
          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
          gpuTarget: ${{ job.target }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
@@ -147,6 +178,8 @@ jobs:
          checkoutRef: ${{ parameters.checkoutRef }}
          dependencyList: ${{ parameters.rocmTestDependencies }}
          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - task: Bash@3
        displayName: Add en_US.UTF-8 locale
        inputs:
@@ -177,7 +210,7 @@ jobs:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
-        componentName: rocprofiler-compute
+          componentName: ${{ parameters.componentName }}
          testDir: $(Build.BinariesDirectory)/libexec/rocprofiler-compute
          testExecutable: ROCM_PATH=$(Agent.BuildDirectory)/rocm ctest
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
--- a/.azuredevops/components/rocprofiler-sdk.yml
+++ b/.azuredevops/components/rocprofiler-sdk.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName  # forwarded to build/manifest/upload/test steps and matched against the DISABLED_<TARGET>_TESTS list
+  type: string
+  default: rocprofiler-sdk
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir  # forwarded to steps/checkout.yml (build and test jobs) and steps/manifest.yml
+  type: string
+  default: ''
+- name: triggerDownstreamJobs  # when true, downstreamAggregateNames is passed to steps/dependencies-rocm.yml
+  type: boolean
+  default: false
+- name: downstreamAggregateNames  # value handed to dependencies-rocm; callers in this change build it as a '+'-separated chain
+  type: string
+  default: ''
+- name: buildDependsOn  # job-name prefixes; each becomes a dependsOn entry suffixed with _<job.target>
+  type: object
+  default: null
+- name: unifiedBuild  # when true, the test jobs of this template are not emitted
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -73,6 +92,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocprofiler_sdk_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:  # null (the default) emits no dependsOn key
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.target }}  # keyed by GPU target here, unlike the _<os> suffix used by OS-keyed templates in this change
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -89,6 +112,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
@@ -96,6 +120,8 @@ jobs:
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:  # pass the aggregate chain to dependencies-rocm only for downstream-triggered runs
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - task: Bash@3
      displayName: Add Python site-packages binaries to path
      inputs:
@@ -105,6 +131,7 @@ jobs:
          echo "##vso[task.prependpath]$USER_BASE/bin"
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
          -DROCPROFILER_BUILD_TESTS=ON
@@ -114,9 +141,12 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -126,13 +156,14 @@ jobs:
    #     gpuTarget: ${{ job.target }}
    #     registerROCmPackages: true

- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:  # test jobs are emitted only when unifiedBuild is false
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: rocprofiler_sdk_test_${{ job.target }}
      dependsOn: rocprofiler_sdk_build_${{ job.target }}
      condition:
        and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
          eq(${{ parameters.aggregatePipeline }}, False)
        )
      variables:
@@ -150,6 +181,7 @@ jobs:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
        parameters:
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
          checkoutRepo: ${{ parameters.checkoutRepo }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
@@ -157,6 +189,8 @@ jobs:
          checkoutRef: ${{ parameters.checkoutRef }}
          dependencyList: ${{ parameters.rocmDependencies }}
          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:  # pass the aggregate chain to dependencies-rocm only for downstream-triggered runs
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - task: Bash@3
        displayName: Add Python and ROCm binaries to path
        inputs:
@@ -167,6 +201,7 @@ jobs:
            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
        parameters:
+          componentName: ${{ parameters.componentName }}
          extraBuildFlags: >-
            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
            -DROCPROFILER_BUILD_TESTS=ON
@@ -177,7 +212,7 @@ jobs:
      - template: ${{ variables.CI_TEMPLATE_PATH}}/steps/gpu-diagnostics.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
-        componentName: rocprofiler-sdk
+          componentName: ${{ parameters.componentName }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
--- a/.azuredevops/components/rocprofiler-systems.yml
+++ b/.azuredevops/components/rocprofiler-systems.yml
@@ -6,6 +6,25 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: componentName  # forwarded to build/manifest/upload/test steps and matched against the DISABLED_<TARGET>_TESTS list
+  type: string
+  default: rocprofiler-systems
+- name: sparseCheckoutDir  # forwarded to steps/checkout.yml and steps/manifest.yml
+  type: string
+  default: ''
+- name: triggerDownstreamJobs  # when true, downstreamAggregateNames is passed to steps/dependencies-rocm.yml in the test job
+  type: boolean
+  default: false
+- name: downstreamAggregateNames  # value handed to dependencies-rocm; callers in this change build it as a '+'-separated chain
+  type: string
+  default: ''
+- name: buildDependsOn  # job-name prefixes; each becomes a dependsOn entry suffixed with _<job.os>
+  type: object
+  default: null
+- name: unifiedBuild  # when true, the test jobs of this template are not emitted
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -87,6 +106,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocprofiler_systems_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:  # null (the default) emits no dependsOn key
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}  # NOTE(review): upstream is keyed by OS while this job is named by target; confirm buildJobs entries define os
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -105,6 +128,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
@@ -136,12 +160,16 @@ jobs:
          -DCMAKE_CXX_FLAGS=-I$(Agent.BuildDirectory)/rocm/include/rocjpeg
          -DGPU_TARGETS=${{ job.target }}
          -GNinja
+        componentName: ${{ parameters.componentName }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        gpuTarget: ${{ job.target }}
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
        gpuTarget: ${{ job.target }}
+        componentName: ${{ parameters.componentName }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
      parameters:
@@ -151,13 +179,14 @@ jobs:
        registerROCmPackages: true
        extraPaths: /home/user/workspace/rocm/bin:/home/user/workspace/rocm/llvm/bin

- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:  # test jobs are generated only when unifiedBuild is false
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: rocprofiler_systems_test_${{ job.target }}
      dependsOn: rocprofiler_systems_build_${{ job.target }}
      condition:
        and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
          eq(${{ parameters.aggregatePipeline }}, False)
        )
      timeoutInMinutes: 180
@@ -186,6 +215,8 @@ jobs:
          checkoutRef: ${{ parameters.checkoutRef }}
          dependencyList: ${{ parameters.rocmDependencies }}
          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - task: Bash@3
        displayName: Add ROCm binaries to PATH
        inputs:
@@ -218,7 +249,7 @@ jobs:
          workingDirectory: build
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
-        componentName: rocprofiler-systems
+          componentName: ${{ parameters.componentName }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
        parameters:
          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocprofiler.yml
+++ b/.azuredevops/components/rocprofiler.yml
@@ -8,6 +8,22 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir  # forwarded to steps/checkout.yml and steps/manifest.yml
+  type: string
+  default: ''
+- name: triggerDownstreamJobs  # when true, downstreamAggregateNames is passed to steps/dependencies-rocm.yml
+  type: boolean
+  default: false
+- name: downstreamAggregateNames  # only forwarded when triggerDownstreamJobs is true
+  type: string
+  default: ''
+- name: buildDependsOn  # job-name prefixes; build jobs depend on <prefix>_<os>_<target>
+  type: object
+  default: null
+- name: unifiedBuild  # when true, the test jobs are not generated
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -70,6 +86,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:  # dependsOn is only emitted when upstream builds are given
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}_${{ job.target }}  # one dependency per prefix, matched on os and target
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -94,6 +114,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
        dependencyList:
@@ -108,6 +129,8 @@ jobs:
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
@@ -115,6 +138,7 @@ jobs:
        extraBuildFlags: >-
          -DCMAKE_MODULE_PATH=$(Build.SourcesDirectory)/cmake_modules;$(Agent.BuildDirectory)/rocm/lib/cmake;$(Agent.BuildDirectory)/rocm/lib/cmake/hip;$(Agent.BuildDirectory)/rocm/lib64/cmake;$(Agent.BuildDirectory)/rocm/lib64/cmake/hip
          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
+          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_POSITION_INDEPENDENT_CODE=ON
          -DENABLE_LDCONFIG=OFF
          -DUSE_PROF_API=1
@@ -122,10 +146,13 @@ jobs:
        multithreadFlag: -- -j32
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
@@ -139,7 +166,8 @@ jobs:
            - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
            - ROCM_PATH:::/home/user/workspace/rocm

- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:  # test jobs are generated only when unifiedBuild is false
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
      condition:
@@ -159,6 +187,7 @@ jobs:
      workspace:
        clean: all
      steps:
+      - checkout: none
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
        parameters:
          aptPackages: ${{ parameters.aptPackages }}
@@ -166,6 +195,7 @@ jobs:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
          gpuTarget: ${{ job.target }}
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
@@ -177,6 +207,8 @@ jobs:
          dependencyList: ${{ parameters.rocmDependencies }}
          gpuTarget: ${{ job.target }}
          os: ${{ job.os }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
--- a/.azuredevops/components/roctracer.yml
+++ b/.azuredevops/components/roctracer.yml
@@ -8,6 +8,22 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir  # forwarded to steps/checkout.yml and steps/manifest.yml
+  type: string
+  default: ''
+- name: triggerDownstreamJobs  # when true, downstreamAggregateNames is passed to steps/dependencies-rocm.yml
+  type: boolean
+  default: false
+- name: downstreamAggregateNames  # only forwarded when triggerDownstreamJobs is true
+  type: string
+  default: ''
+- name: buildDependsOn  # job-name prefixes; build jobs depend on <prefix>_<os>_<target>
+  type: object
+  default: null
+- name: unifiedBuild  # when true, the test jobs are not generated
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -65,6 +81,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:  # dependsOn is only emitted when upstream builds are given
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}_${{ job.target }}  # one dependency per prefix, matched on os and target
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -87,6 +107,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
@@ -94,6 +115,8 @@ jobs:
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    # the linker flags will not affect ubuntu2204 builds as the paths do not exist
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
@@ -109,10 +132,13 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
@@ -123,13 +149,14 @@ jobs:
    #     gpuTarget: ${{ job.target }}
    #     registerROCmPackages: true

- ${{ each job in parameters.jobMatrix.testJobs }}:
+- ${{ if eq(parameters.unifiedBuild, False) }}:  # test jobs are generated only when unifiedBuild is false
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
      condition:
        and(succeeded(),
          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
          eq(${{ parameters.aggregatePipeline }}, False)
        )
      variables:
@@ -148,6 +175,7 @@ jobs:
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
          gpuTarget: ${{ job.target }}
          os: ${{ job.os }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
@@ -157,10 +185,12 @@ jobs:
          dependencyList: ${{ parameters.rocmTestDependencies }}
          gpuTarget: ${{ job.target }}
          os: ${{ job.os }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
        parameters:
-        componentName: roctracer
+          componentName: ${{ parameters.componentName }}
          testExecutable: $(Agent.BuildDirectory)/rocm/share/roctracer/run_tests.sh
          testParameters: ''
          testDir: $(Agent.BuildDirectory)
--- a/.azuredevops/dependencies/catch2.yml
+++ b/.azuredevops/dependencies/catch2.yml
@@ -0,0 +1,63 @@
+parameters:
+- name: checkoutRepo  # declared but not referenced by the jobs in this template
+  type: string
+  default: 'self'
+- name: checkoutRef  # declared but not referenced by the jobs in this template
+  type: string
+  default: ''
+- name: catch2Version  # git branch or tag handed to `git clone -b`; callers are expected to set it
+  type: string
+  default: ''
+- name: aptPackages  # forwarded to steps/dependencies-other.yml
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+
+- name: jobMatrix  # one build job is generated per buildJobs entry
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
+
+jobs:  # clone Catch2 at the requested ref, build it with CMake/Ninja, pass it to artifact-upload.yml
+- ${{ each job in parameters.jobMatrix.buildJobs }}:  # one job per buildJobs entry
+  - job: catch2_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:  # almalinux8 entries run inside the manylinux228 container
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none  # no repository checkout; Catch2 is cloned by a later step
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - task: Bash@3  # shallow clone: only the requested ref is built, so history is not fetched
+      displayName: Clone catch2 ${{ parameters.catch2Version }}
+      inputs:
+        targetType: inline
+        script: git clone --depth 1 https://github.com/catchorg/Catch2.git -b ${{ parameters.catch2Version }}
+        workingDirectory: $(Agent.BuildDirectory)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/Catch2/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/Catch2
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_BUILD_TYPE=Release
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
--- a/.azuredevops/dependencies/fmtlib.yml
+++ b/.azuredevops/dependencies/fmtlib.yml
@@ -0,0 +1,67 @@
+parameters:
+- name: checkoutRepo  # declared but not referenced by the jobs in this template
+  type: string
+  default: 'self'
+- name: checkoutRef  # declared but not referenced by the jobs in this template
+  type: string
+  default: ''
+- name: fmtlibVersion  # git branch or tag handed to `git clone -b`; callers are expected to set it
+  type: string
+  default: ''
+- name: aptPackages  # forwarded to steps/dependencies-other.yml
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+    - libfmt-dev  # NOTE(review): installs a system fmt next to the one built here - confirm it is needed
+
+- name: jobMatrix  # one build job is generated per buildJobs entry
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
+
+jobs:  # clone fmt at the requested ref, build it with CMake/Ninja, pass it to artifact-upload.yml
+- ${{ each job in parameters.jobMatrix.buildJobs }}:  # one job per buildJobs entry
+  - job: fmtlib_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:  # almalinux8 entries run inside the manylinux228 container
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none  # no repository checkout; fmt is cloned by a later step
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - task: Bash@3  # shallow clone: only the requested ref is built, so history is not fetched
+      displayName: Clone fmtlib ${{ parameters.fmtlibVersion }}
+      inputs:
+        targetType: inline
+        script: git clone --depth 1 https://github.com/fmtlib/fmt.git -b ${{ parameters.fmtlibVersion }}
+        workingDirectory: $(Agent.BuildDirectory)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/fmt/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/fmt
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_BUILD_TYPE=Release
+          -DFMT_SYSTEM_HEADERS=ON
+          -DFMT_INSTALL=ON
+          -DFMT_TEST=OFF
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
--- a/.azuredevops/dependencies/libdivide.yml
+++ b/.azuredevops/dependencies/libdivide.yml
@@ -0,0 +1,64 @@
+parameters:
+- name: checkoutRepo  # declared but not referenced by the jobs in this template
+  type: string
+  default: 'self'
+- name: checkoutRef  # declared but not referenced by the jobs in this template
+  type: string
+  default: ''
+- name: libdivideVersion  # git branch or tag handed to `git clone -b`; callers are expected to set it
+  type: string
+  default: ''
+- name: aptPackages  # forwarded to steps/dependencies-other.yml
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+
+- name: jobMatrix  # one build job is generated per buildJobs entry
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
+
+jobs:  # clone libdivide at the requested ref, build it with CMake/Ninja, pass it to artifact-upload.yml
+- ${{ each job in parameters.jobMatrix.buildJobs }}:  # one job per buildJobs entry
+  - job: libdivide_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:  # almalinux8 entries run inside the manylinux228 container
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none  # no repository checkout; libdivide is cloned by a later step
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - task: Bash@3  # shallow clone: only the requested ref is built, so history is not fetched
+      displayName: Clone libdivide ${{ parameters.libdivideVersion }}
+      inputs:
+        targetType: inline
+        script: git clone --depth 1 https://github.com/ridiculousfish/libdivide.git -b ${{ parameters.libdivideVersion }}
+        workingDirectory: $(Agent.BuildDirectory)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/libdivide/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/libdivide
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_BUILD_TYPE=Release
+          -DLIBDIVIDE_BUILD_TESTS=OFF
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
--- a/.azuredevops/dependencies/spdlog.yml
+++ b/.azuredevops/dependencies/spdlog.yml
@@ -0,0 +1,71 @@
+parameters:
+- name: checkoutRepo  # declared but not referenced by the jobs in this template
+  type: string
+  default: 'self'
+- name: checkoutRef  # declared but not referenced by the jobs in this template
+  type: string
+  default: ''
+- name: spdlogVersion  # git branch or tag handed to `git clone -b`; callers are expected to set it
+  type: string
+  default: ''
+- name: aptPackages  # forwarded to steps/dependencies-other.yml
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+
+- name: jobMatrix  # one build job is generated per buildJobs entry
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
+
+jobs:  # clone spdlog at the requested ref, build it against vendored fmtlib, pass it to artifact-upload.yml
+- ${{ each job in parameters.jobMatrix.buildJobs }}:  # one job per buildJobs entry
+  - job: spdlog_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:  # almalinux8 entries run inside the manylinux228 container
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none  # no repository checkout; spdlog is cloned by a later step
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml  # fmtlib is requested as a vendor dependency
+      parameters:
+        dependencyList:
+          - fmtlib
+    - task: Bash@3  # shallow clone: only the requested ref is built, so history is not fetched
+      displayName: Clone spdlog ${{ parameters.spdlogVersion }}
+      inputs:
+        targetType: inline
+        script: git clone --depth 1 https://github.com/gabime/spdlog.git -b ${{ parameters.spdlogVersion }}
+        workingDirectory: $(Agent.BuildDirectory)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml  # prefix path below points at the vendor directory
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/spdlog/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/spdlog
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/vendor
+          -DCMAKE_BUILD_TYPE=Release
+          -DSPDLOG_USE_STD_FORMAT=OFF
+          -DSPDLOG_FMT_EXTERNAL_HO=ON
+          -DSPDLOG_INSTALL=ON
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
--- a/.azuredevops/nightly/pytorch.yml
+++ b/.azuredevops/nightly/pytorch.yml
@@ -397,6 +397,7 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - task: DownloadPipelineArtifact@2
    displayName: 'Download Pipeline Wheel Files'
+    retryCountOnTaskFailure: 3
    inputs:
      itemPattern: '**/*.whl'
      targetPath: $(Agent.BuildDirectory)
--- a/.azuredevops/nightly/rocm-nightly.yml
+++ b/.azuredevops/nightly/rocm-nightly.yml
@@ -93,7 +93,7 @@ schedules:
 jobs:
 - ${{ each job in parameters.jobList }}:
  - job: nightly_${{ job.os }}_${{ job.target }}
-    timeoutInMinutes: 90
+    timeoutInMinutes: 120
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -226,6 +226,7 @@ jobs:
            cat Dockerfile
    - task: Docker@2
      displayName: Build and upload Docker image
+      retryCountOnTaskFailure: 3
      inputs:
        containerRegistry: ContainerService3
        repository: 'nightly-${{ job.os }}-${{ job.target }}'
--- a/.azuredevops/tag-builds/catch2.yml
+++ b/.azuredevops/tag-builds/catch2.yml
@@ -0,0 +1,23 @@
+variables:  # pipeline that builds Catch2 at a chosen ref via the catch2 dependency template
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: catch2Version  # ref to build; forwarded unchanged to the dependency template
+  type: string
+  default: "v3.7.0"
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none  # no CI trigger is configured
+pr: none  # no pull-request trigger is configured
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/catch2.yml
+    parameters:
+      catch2Version: ${{ parameters.catch2Version }}
--- a/.azuredevops/tag-builds/fmtlib.yml
+++ b/.azuredevops/tag-builds/fmtlib.yml
@@ -0,0 +1,23 @@
+variables:  # pipeline that builds fmt at a chosen ref via the fmtlib dependency template
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: fmtlibVersion  # ref to build; forwarded unchanged to the dependency template
+  type: string
+  default: "11.1.3"
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none  # no CI trigger is configured
+pr: none  # no pull-request trigger is configured
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/fmtlib.yml
+    parameters:
+      fmtlibVersion: ${{ parameters.fmtlibVersion }}
--- a/.azuredevops/tag-builds/libdivide.yml
+++ b/.azuredevops/tag-builds/libdivide.yml
@@ -0,0 +1,23 @@
+variables:  # pipeline that builds libdivide at a chosen ref via the libdivide dependency template
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: libdivideVersion  # ref to build; forwarded unchanged to the dependency template
+  type: string
+  default: master  # NOTE(review): a branch name rather than a version tag - confirm pinning is not wanted
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none  # no CI trigger is configured
+pr: none  # no pull-request trigger is configured
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/libdivide.yml
+    parameters:
+      libdivideVersion: ${{ parameters.libdivideVersion }}
--- a/.azuredevops/tag-builds/spdlog.yml
+++ b/.azuredevops/tag-builds/spdlog.yml
@@ -0,0 +1,23 @@
+variables:  # pipeline that builds spdlog at a chosen ref via the spdlog dependency template
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: spdlogVersion  # ref to build; forwarded unchanged to the dependency template
+  type: string
+  default: "v1.15.1"
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none  # no CI trigger is configured
+pr: none  # no pull-request trigger is configured
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/spdlog.yml
+    parameters:
+      spdlogVersion: ${{ parameters.spdlogVersion }}
--- a/.azuredevops/templates/steps/artifact-download.yml
+++ b/.azuredevops/templates/steps/artifact-download.yml
@@ -24,7 +24,11 @@ parameters:
 steps:
 - task: DownloadPipelineArtifact@2
  displayName: Download ${{ parameters.componentName }}
+  retryCountOnTaskFailure: 3
  inputs:
+    ${{ if eq(parameters.componentName, 'clr') }}:  # clr downloads are narrowed to artifact names containing "amd"
+      itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*amd*' # filter out nvidia clr artifacts
+    ${{ else }}:  # every other component keeps the pattern without the "amd" suffix
      itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
    targetPath: '$(Pipeline.Workspace)/d'
    allowPartiallySucceededBuilds: true
--- a/.azuredevops/templates/steps/checkout.yml
+++ b/.azuredevops/templates/steps/checkout.yml
@@ -20,7 +20,7 @@ steps:
    retryCountOnTaskFailure: 3
    fetchFilter: blob:none
    ${{ if ne(parameters.sparseCheckoutDir, '') }}:
-      sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }}
+      sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }} shared  # "shared" is always checked out with the component dir
      path: sparse
  - ${{ if ne(parameters.sparseCheckoutDir, '') }}:
    - task: Bash@3
--- a/.azuredevops/templates/steps/dependencies-apt.yml
+++ b/.azuredevops/templates/steps/dependencies-apt.yml
@@ -10,6 +10,7 @@ steps:
 - ${{ if eq(parameters.registerROCmPackages, true) }}:
  - task: Bash@3
    displayName: 'Register AMDGPU & ROCm repos (apt)'
+    retryCountOnTaskFailure: 3
    inputs:
      targetType: inline
      script: |
@@ -20,7 +21,8 @@ steps:
        echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
        sudo apt update
 - task: Bash@3
-  displayName: 'sudo apt-get update'
+  displayName: 'APT update and install packages'  # update, fix-broken and install now run in this one step
+  retryCountOnTaskFailure: 3  # the whole combined script is re-run on failure
  inputs:
    targetType: inline
    script: |
@@ -28,15 +30,6 @@ steps:
      echo "deb http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
      echo "deb http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
      echo "deb http://archive.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
-      sudo DEBIAN_FRONTEND=noninteractive apt-get --yes update
- task: Bash@3
-  displayName: 'sudo apt-get fix'
-  inputs:
-    targetType: inline
-    script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-broken install
- ${{ if gt(length(parameters.aptPackages), 0) }}:
-  - task: Bash@3
-    displayName: 'sudo apt-get install ...'
-    inputs:
-      targetType: inline
-      script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install ${{ join(' ', parameters.aptPackages) }}
+      sudo DEBIAN_FRONTEND=noninteractive apt-get --yes update && \
+        sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-broken install && \
+        sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install ${{ join(' ', parameters.aptPackages) }}
--- a/.azuredevops/templates/steps/dependencies-aqlprofile.yml
+++ b/.azuredevops/templates/steps/dependencies-aqlprofile.yml
@@ -5,51 +5,28 @@ parameters:

 steps:
 - task: Bash@3
-  displayName: Get aqlprofile package name
+  displayName: Download and install aqlprofile
+  retryCountOnTaskFailure: 3
  inputs:
    targetType: inline
-    ${{ if eq(parameters.os, 'ubuntu2204') }}:
+    workingDirectory: $(Agent.BuildDirectory)
    script: |
-        export packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb")
-        echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
-    ${{ if eq(parameters.os, 'almalinux8') }}:
-      script: |
-        export packageName=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1)
-        echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
- task: Bash@3
-  displayName: 'Download aqlprofile'
-  inputs:
-    targetType: inline
-    workingDirectory: '$(Pipeline.Workspace)'
-    ${{ if eq(parameters.os, 'ubuntu2204') }}:
-      script: wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$(packageName)
-    ${{ if eq(parameters.os, 'almalinux8') }}:
-      script: wget -nv https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/$(packageName)
- task: Bash@3
-  displayName: 'Extract aqlprofile'
-  inputs:
-    targetType: inline
-    workingDirectory: '$(Pipeline.Workspace)'
-    ${{ if eq(parameters.os, 'ubuntu2204') }}:
-      script: |
-        mkdir hsa-amd-aqlprofile
-        dpkg-deb -R $(packageName) hsa-amd-aqlprofile
-    ${{ if eq(parameters.os, 'almalinux8') }}:
-      script: |
-        mkdir hsa-amd-aqlprofile
-        sudo dnf -y install rpm-build cpio
-        rpm2cpio $(packageName) | (cd hsa-amd-aqlprofile && cpio -idmv)
- task: Bash@3
-  displayName: 'Copy aqlprofile files'
-  inputs:
-    targetType: inline
-    script: |
-      mkdir -p $(Agent.BuildDirectory)/rocm
-      cp -R hsa-amd-aqlprofile/opt/rocm-*/* $(Agent.BuildDirectory)/rocm
-    workingDirectory: '$(Pipeline.Workspace)'
- task: Bash@3
-  displayName: 'Clean up aqlprofile'
-  inputs:
-    targetType: inline
-    script: rm -rf hsa-amd-aqlprofile $(packageName)
-    workingDirectory: '$(Pipeline.Workspace)'
+      set -e  # wget uses -O so a retried run overwrites the package instead of saving a ".1" copy
+      if [ "${{ parameters.os }}" = "ubuntu2204" ]; then
+        packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb") && \
+        wget -nv -O "$packageName" https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$packageName && \
+        mkdir -p hsa-amd-aqlprofile && \
+        dpkg-deb -R "$packageName" hsa-amd-aqlprofile
+      elif [ "${{ parameters.os }}" = "almalinux8" ]; then
+        sudo dnf -y install rpm-build cpio && \
+        packageName=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1) && \
+        wget -nv -O "$packageName" https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/$packageName && \
+        mkdir -p hsa-amd-aqlprofile && \
+        rpm2cpio "$packageName" | (cd hsa-amd-aqlprofile && cpio -idmv)
+      else
+        echo "Unsupported OS: ${{ parameters.os }}"
+        exit 1
+      fi && \
+      mkdir -p $(Agent.BuildDirectory)/rocm && \
+      cp -R hsa-amd-aqlprofile/opt/rocm-*/* $(Agent.BuildDirectory)/rocm && \
+      rm -rf hsa-amd-aqlprofile "$packageName"
--- a/.azuredevops/templates/steps/dependencies-dnf.yml
+++ b/.azuredevops/templates/steps/dependencies-dnf.yml
@@ -89,6 +89,7 @@ steps:
 - ${{ if eq(parameters.registerROCmPackages, true) }}:
  - task: Bash@3
    displayName: 'Register AMDGPU & ROCm repos (dnf)'
+    retryCountOnTaskFailure: 3
    inputs:
      targetType: inline
      script: |
@@ -109,12 +110,13 @@ steps:
        sudo dnf makecache
 - task: Bash@3
  displayName: 'Install base dnf packages'
+  retryCountOnTaskFailure: 3
  inputs:
    targetType: inline
    script: |
-      sudo dnf config-manager --set-enabled powertools
      # rpm fusion free repo for some dependencies
-      sudo dnf -y install https://download1.rpmfusion.org/free/el/rpmfusion-free-release-8.noarch.rpm
+      sudo dnf config-manager --set-enabled powertools && \
+      sudo dnf -y install https://download1.rpmfusion.org/free/el/rpmfusion-free-release-8.noarch.rpm && \
      sudo dnf -y install ${{ join(' ', parameters.basePackages) }}
 - task: Bash@3
  displayName: 'Check gcc environment'
@@ -128,6 +130,7 @@ steps:
      g++ -print-file-name=libstdc++.so
 - task: Bash@3
  displayName: 'Set python 3.11 as default'
+  retryCountOnTaskFailure: 3
  inputs:
    targetType: inline
    script: |
@@ -142,18 +145,20 @@ steps:
  - ${{ if eq(pkg, 'ninja-build') }}:
    - task: Bash@3
      displayName: 'Install ninja 1.11.1'
+      retryCountOnTaskFailure: 3
      inputs:
        targetType: inline
        script: |
-          curl -LO https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip
-          sudo dnf -y install unzip
-          unzip ninja-linux.zip
-          sudo mv ninja /usr/local/bin/ninja
-          sudo chmod +x /usr/local/bin/ninja
+          sudo dnf -y install unzip && \
+          curl -LO https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip && \
+          unzip ninja-linux.zip && \
+          sudo mv ninja /usr/local/bin/ninja && \
+          sudo chmod +x /usr/local/bin/ninja && \
          echo "##vso[task.prependpath]/usr/local/bin"
  - ${{ if ne(parameters.aptToDnfMap[pkg], '') }}:
    - task: Bash@3
      displayName: 'dnf install ${{ parameters.aptToDnfMap[pkg] }}'
+      retryCountOnTaskFailure: 3
      inputs:
        targetType: inline
        script: |
--- a/.azuredevops/templates/steps/dependencies-other.yml
+++ b/.azuredevops/templates/steps/dependencies-other.yml
@@ -27,6 +27,7 @@ steps:
 - ${{ if gt(length(parameters.pipModules), 0) }}:
  - task: Bash@3
    displayName: 'pip install  ...'
+    retryCountOnTaskFailure: 3
    inputs:
      targetType: inline
      script: python3 -m pip install -v --force-reinstall ${{ join(' ', parameters.pipModules) }}
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -47,8 +47,8 @@ parameters:
      developBranch: aomp-dev
      hasGpuTarget: false
    clr:
-      pipelineId: 145
-      developBranch: amd-staging
+      pipelineId: 335
+      developBranch: develop
      hasGpuTarget: false
    composable_kernel:
      pipelineId: 86
@@ -59,12 +59,12 @@ parameters:
      developBranch: rocm
      hasGpuTarget: false
    HIP:
-      pipelineId: 93
-      developBranch: amd-staging
+      pipelineId: 335
+      developBranch: develop
      hasGpuTarget: false
    hip-tests:
-      pipelineId: 233
-      developBranch: amd-staging
+      pipelineId: 362
+      developBranch: develop
      hasGpuTarget: false
    hipBLAS:
      pipelineId: 317
@@ -171,16 +171,16 @@ parameters:
      developBranch: develop
      hasGpuTarget: false
    rocm-core:
-      pipelineId: 103
-      developBranch: master
+      pipelineId: 349
+      developBranch: develop
      hasGpuTarget: false
    rocm-examples:
      pipelineId: 216
      developBranch: amd-staging
      hasGpuTarget: true
    rocminfo:
-      pipelineId: 91
-      developBranch: amd-staging
+      pipelineId: 356
+      developBranch: develop
      hasGpuTarget: false
    rocMLIR:
      pipelineId: 229
@@ -195,19 +195,19 @@ parameters:
      developBranch: master
      hasGpuTarget: false
    rocm_smi_lib:
-      pipelineId: 96
-      developBranch: amd-staging
+      pipelineId: 358
+      developBranch: develop
      hasGpuTarget: false
    rocPRIM:
      pipelineId: 273
      developBranch: develop
      hasGpuTarget: true
    rocprofiler:
-      pipelineId: 143
-      developBranch: amd-staging
+      pipelineId: 329
+      developBranch: develop
      hasGpuTarget: true
    rocprofiler-compute:
-      pipelineId: 257
+      pipelineId: 344
      developBranch: develop
      hasGpuTarget: true
    rocprofiler-register:
@@ -215,8 +215,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: false
    rocprofiler-sdk:
-      pipelineId: 246
-      developBranch: amd-staging
+      pipelineId: 347
+      developBranch: develop
      hasGpuTarget: true
    rocprofiler-systems:
      pipelineId: 255
@@ -227,8 +227,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    ROCR-Runtime:
-      pipelineId: 10
-      developBranch: amd-staging
+      pipelineId: 354
+      developBranch: develop
      hasGpuTarget: false
    rocRAND:
      pipelineId: 274
@@ -251,8 +251,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    roctracer:
-      pipelineId: 141
-      developBranch: amd-staging
+      pipelineId: 331
+      developBranch: develop
      hasGpuTarget: true
    rocWMMA:
      pipelineId: 109
--- a/.azuredevops/templates/steps/dependencies-vendor.yml
+++ b/.azuredevops/templates/steps/dependencies-vendor.yml
@@ -8,15 +8,20 @@ parameters:
  type: object
  default:
    boost: 250
+    catch2: 343
+    fmtlib: 341
    grpc: 72
    gtest: 73
    half560: 68
    lapack: 69
+    libdivide: 342
+    spdlog: 340

 steps:
 - ${{ each dependency in parameters.dependencyList }}:
  - task: DownloadPipelineArtifact@2
    displayName: Download ${{ dependency }}
+    retryCountOnTaskFailure: 3
    inputs:
      project: ROCm-CI
      buildType: specific
@@ -28,7 +33,7 @@ steps:
    inputs:
      archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
      destinationFolder: $(Agent.BuildDirectory)/vendor
-      cleanDestinationFolder: true
+      cleanDestinationFolder: false
      overwriteExistingFiles: true
  - task: DeleteFiles@1
    displayName: Clean up ${{ dependency }}
--- a/.azuredevops/templates/steps/local-artifact-download.yml
+++ b/.azuredevops/templates/steps/local-artifact-download.yml
@@ -33,6 +33,7 @@ parameters:
 steps:
  - task: DownloadPipelineArtifact@2
    displayName: Download ${{ parameters.preTargetFilter}}*${{ parameters.os }}_${{ parameters.gpuTarget}}*${{ parameters.postTargetFilter}}
+    retryCountOnTaskFailure: 3
    inputs:
      ${{ if eq(parameters.buildType, 'specific') }}:
        buildType: specific
--- a/.azuredevops/templates/steps/miopen-get-ck-build.yml
+++ b/.azuredevops/templates/steps/miopen-get-ck-build.yml
@@ -7,6 +7,7 @@ steps:
 - task: Bash@3
  name: downloadCKBuild
  displayName: Download specific CK build
+  retryCountOnTaskFailure: 3
  env:
    CXX: $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
    CC: $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
--- a/.azuredevops/variables-global.yml
+++ b/.azuredevops/variables-global.yml
@@ -28,13 +28,13 @@ variables:
 - name: GFX90A_TEST_POOL
  value: gfx90a_test_pool
 - name: LATEST_RELEASE_VERSION
-  value: 6.4.2
+  value: 6.4.3
 - name: REPO_RADEON_VERSION
-  value: 6.4.2
+  value: 6.4.3
 - name: NEXT_RELEASE_VERSION
  value: 7.0.0
 - name: LATEST_RELEASE_TAG
-  value: rocm-6.4.2
+  value: rocm-6.4.3
 - name: DOCKER_SKIP_GFX
  value: gfx90a
 - name: COMPOSABLE_KERNEL_PIPELINE_ID
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -5,6 +5,7 @@ ACEs
 ACS
 AccVGPR
 AccVGPRs
+AITER
 ALU
 AllReduce
 AMD
@@ -118,6 +119,7 @@ Deprecations
 DevCap
 DirectX
 Dockerfile
+Dockerized
 Doxygen
 dropless
 ELMo
@@ -125,6 +127,7 @@ ENDPGM
 EPYC
 ESXi
 EoS
+fas
 FBGEMM
 FIFOs
 FFT
@@ -157,6 +160,7 @@ GEMMs
 GFLOPS
 GFortran
 GFXIP
+GGUF
 Gemma
 GiB
 GIM
@@ -198,6 +202,7 @@ HWE
 HWS
 Haswell
 Higgs
+href
 Hyperparameters
 Huggingface
 IB
@@ -301,6 +306,7 @@ Multicore
 Multithreaded
 MyEnvironment
 MyST
+NANOO
 NBIO
 NBIOs
 NCCL
@@ -373,6 +379,7 @@ PowerEdge
 PowerShell
 Pretrained
 Pretraining
+Primus
 Profiler's
 PyPi
 Pytest
@@ -515,6 +522,7 @@ unwindowed
 VALU
 VBIOS
 VCN
+verl's
 VGPR
 VGPRs
 VM
@@ -546,6 +554,7 @@ Xilinx
 Xnack
 Xteam
 YAML
+YAMLs
 YML
 YModel
 ZeRO
@@ -610,6 +619,7 @@ completers
 composable
 concretization
 config
+configs
 conformant
 const
 constructible
@@ -692,6 +702,7 @@ github
 globals
 gnupg
 grayscale
+gx
 gzip
 heterogenous
 hipBLAS
@@ -764,6 +775,7 @@ logits
 lossy
 macOS
 matchers
+maxtext
 megatron
 microarchitecture
 migraphx
@@ -802,6 +814,7 @@ parallelizing
 param
 parameterization
 passthrough
+pe
 perfcounter
 performant
 perl
@@ -824,11 +837,14 @@ preprocessing
 preprocessor
 prequantized
 prerequisites
+pretrain
 pretraining
+primus
 profiler
 profilers
 protobuf
 pseudorandom
+px
 py
 pytorch
 recommender
@@ -944,6 +960,7 @@ toolchain
 toolchains
 toolset
 toolsets
+torchtitan
 torchvision
 tqdm
 tracebacks
--- a/docs/about/license.md
+++ b/docs/about/license.md
@@ -29,7 +29,7 @@ additional licenses. Please review individual repositories for more information.
 | [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/amd-staging/LICENSE) |
 | [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
 | [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
-| [AQLprofile] | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE) |
+| [AQLprofile] | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE.md) |
 | [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
 | [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
 | [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
@@ -50,7 +50,7 @@ additional licenses. Please review individual repositories for more information.
 | [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
 | [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
 | [MIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
-| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/MIOpen/blob/develop/LICENSE.txt) |
+| [MIOpen](https://github.com/ROCm/MIOpen/) | [MIT](https://github.com/ROCm/rocm-libraries/blob/develop/projects/miopen/LICENSE.md) |
 | [MIVisionX](https://github.com/ROCm/MIVisionX/) | [MIT](https://github.com/ROCm/MIVisionX/blob/develop/LICENSE.txt) |
 | [rocAL](https://github.com/ROCm/rocAL) | [MIT](https://github.com/ROCm/rocAL/blob/develop/LICENSE.txt) |
 | [rocALUTION](https://github.com/ROCm/rocALUTION/) | [MIT](https://github.com/ROCm/rocALUTION/blob/develop/LICENSE.md) |
@@ -67,15 +67,15 @@ additional licenses. Please review individual repositories for more information.
 | [ROCm Communication Collectives Library (RCCL)](https://github.com/ROCm/rccl/) | [Custom](https://github.com/ROCm/rccl/blob/develop/LICENSE.txt) |
 | [ROCm-Core](https://github.com/ROCm/rocm-core) | [MIT](https://github.com/ROCm/rocm-core/blob/master/copyright) |
 | [ROCm Compute Profiler](https://github.com/ROCm/rocprofiler-compute) | [MIT](https://github.com/ROCm/rocprofiler-compute/blob/amd-staging/LICENSE) |
-| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/amd-staging/LICENSE) |
+| [ROCm Data Center (RDC)](https://github.com/ROCm/rdc/) | [MIT](https://github.com/ROCm/rdc/blob/amd-staging/LICENSE.md) |
 | [ROCm-Device-Libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/LICENSE.TXT) |
 | [ROCm-OpenCL-Runtime](https://github.com/ROCm/clr/tree/amd-staging/opencl) | [MIT](https://github.com/ROCm/clr/blob/amd-staging/opencl/LICENSE.txt) |
 | [ROCm Performance Primitives (RPP)](https://github.com/ROCm/rpp) | [MIT](https://github.com/ROCm/rpp/blob/develop/LICENSE) |
-| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/amd-staging/License.txt) |
-| [ROCm Systems Profiler](https://github.com/ROCm/rocprofiler-systems) | [MIT](https://github.com/ROCm/rocprofiler-systems/blob/amd-staging/LICENSE) |
+| [ROCm SMI Lib](https://github.com/ROCm/rocm_smi_lib/) | [MIT](https://github.com/ROCm/rocm_smi_lib/blob/amd-staging/LICENSE.md) |
+| [ROCm Systems Profiler](https://github.com/ROCm/rocprofiler-systems) | [MIT](https://github.com/ROCm/rocprofiler-systems/blob/amd-staging/LICENSE.md) |
 | [ROCm Validation Suite](https://github.com/ROCm/ROCmValidationSuite/) | [MIT](https://github.com/ROCm/ROCmValidationSuite/blob/master/LICENSE) |
 | [rocPRIM](https://github.com/ROCm/rocPRIM/) | [MIT](https://github.com/ROCm/rocPRIM/blob/develop/LICENSE.txt) |
-| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-staging/LICENSE) |
+| [ROCProfiler](https://github.com/ROCm/rocprofiler/) | [MIT](https://github.com/ROCm/rocprofiler/blob/amd-staging/LICENSE.md) |
 | [ROCprofiler-SDK](https://github.com/ROCm/rocprofiler-sdk) | [MIT](https://github.com/ROCm/rocprofiler-sdk/blob/amd-mainline/LICENSE) |
 | [rocPyDecode](https://github.com/ROCm/rocPyDecode) | [MIT](https://github.com/ROCm/rocPyDecode/blob/develop/LICENSE.txt) |
 | [rocRAND](https://github.com/ROCm/rocRAND/) | [MIT](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt) |
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -253,8 +253,12 @@ Expand for full historical view of:
   .. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
   .. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
   .. [#verl_compat] verl is only supported on ROCm 6.2.0.
+   .. [#stanford-megatron-lm_compat] Stanford Megatron-LM is only supported on ROCm 6.3.0.
   .. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
+   .. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
   .. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
-   .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The supported user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
+   .. [#ray_compat] Ray is only supported on ROCm 6.4.1.
+   .. [#llama-cpp_compat] llama.cpp is only supported on ROCm 6.4.0.
+   .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
   .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
   
--- a/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/llama-cpp-compatibility.rst
@@ -0,0 +1,156 @@
+:orphan:
+
+.. meta::
+    :description: llama.cpp deep learning framework compatibility
+    :keywords: GPU, GGML, llama.cpp compatibility
+
+.. version-set:: rocm_version latest
+
+********************************************************************************
+llama.cpp compatibility
+********************************************************************************
+
+`llama.cpp <https://github.com/ggml-org/llama.cpp>`__ is an open-source framework 
+for Large Language Model (LLM) inference that runs on both central processing units 
+(CPUs) and graphics processing units (GPUs). It is written in plain C/C++, providing 
+a simple, dependency-free setup. 
+
+The framework supports multiple quantization options, from 1.5-bit to 8-bit integers, 
+to speed up inference and reduce memory usage. Originally built as a CPU-first library, 
+llama.cpp is easy to integrate with other programming environments and is widely 
+adopted across diverse platforms, including consumer devices. 
+
+ROCm support for llama.cpp is upstreamed, and you can build the official source code
+with ROCm support:
+
+- ROCm support for llama.cpp is hosted in the official `https://github.com/ROCm/llama.cpp 
+  <https://github.com/ROCm/llama.cpp>`_ repository.
+
+- Due to independent compatibility considerations, this location differs from the 
+  `https://github.com/ggml-org/llama.cpp <https://github.com/ggml-org/llama.cpp>`_ upstream repository.
+
+- To install llama.cpp, use the prebuilt :ref:`Docker image <llama-cpp-docker-compat>`, 
+  which includes ROCm, llama.cpp, and all required dependencies.
+
+  - See the :doc:`ROCm llama.cpp installation guide <rocm-install-on-linux:install/3rd-party/llama-cpp-install>` 
+    to install and get started.
+
+  - See the `Installation guide <https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip>`__ 
+    in the upstream llama.cpp documentation.
+
+.. note::
+
+  llama.cpp is supported on ROCm 6.4.0.
+
+Supported devices
+================================================================================
+
+**Officially Supported**: AMD Instinct™ MI300X, MI210
+
+
+Use cases and recommendations
+================================================================================
+
+llama.cpp can be applied in a variety of scenarios, particularly when you need to meet one or more of the following requirements:
+
+- Plain C/C++ implementation with no external dependencies
+- Support for 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory usage
+- Custom HIP (Heterogeneous-compute Interface for Portability) kernels for running large language models (LLMs) on AMD GPUs (graphics processing units)
+- CPU (central processing unit) + GPU (graphics processing unit) hybrid inference for partially accelerating models larger than the total available VRAM (video random-access memory)
+
+llama.cpp is also used in a range of real-world applications, including:
+
+- Games such as `Lucy's Labyrinth <https://github.com/MorganRO8/Lucys_Labyrinth>`__:
+  A simple maze game where AI-controlled agents attempt to trick the player.
+- Tools such as `Styled Lines <https://marketplace.unity.com/packages/tools/ai-ml-integration/style-text-webgl-ios-stand-alone-llm-llama-cpp-wrapper-292902>`__:
+  A proprietary, asynchronous inference wrapper for Unity3D game development, including pre-built mobile and web platform wrappers and a model example.
+- Various other AI applications use llama.cpp as their inference engine;  
+  for a detailed list, see the `user interfaces (UIs) section <https://github.com/ggml-org/llama.cpp?tab=readme-ov-file#description>`__.
+
+For more use cases and recommendations, refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
+where you can search for llama.cpp examples and best practices to optimize your workloads on AMD GPUs.
+
+- The `Llama.cpp Meets Instinct: A New Era of Open-Source AI Acceleration <https://rocm.blogs.amd.com/ecosystems-and-partners/llama-cpp/README.html>`__
+  blog post outlines how the open-source llama.cpp framework enables efficient LLM inference—including interactive inference with ``llama-cli``, 
+  server deployment with ``llama-server``, GGUF model preparation and quantization, performance benchmarking, and optimizations tailored for 
+  AMD Instinct GPUs within the ROCm ecosystem. 
+
+.. _llama-cpp-docker-compat:
+
+Docker image compatibility
+================================================================================
+
+.. |docker-icon| raw:: html
+
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes `ROCm llama.cpp Docker images <https://hub.docker.com/r/rocm/llama.cpp>`__
+with ROCm backends on Docker Hub. The following Docker image tags and associated
+inventories were tested on `ROCm 6.4.0 <https://repo.radeon.com/rocm/apt/6.4/>`__.
+Click |docker-icon| to view the image on Docker Hub.
+
+.. important::
+
+   Tag endings of ``_full``, ``_server``, and ``_light`` serve different purposes for entrypoints as follows:
+
+   - Full: This image includes both the main executable file and the tools to convert ``LLaMA`` models into ``ggml`` format and quantize them to 4 bits.
+   - Server: This image only includes the server executable file.
+   - Light: This image only includes the main executable file.
+
+.. list-table::
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Full Docker
+      - Server Docker
+      - Light Docker
+      - llama.cpp
+      - Ubuntu
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_full/images/sha256-f78f6c81ab2f8e957469415fe2370a1334fe969c381d1fe46050c85effaee9d5"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+      - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_server/images/sha256-275ad9e18f292c26a00a2de840c37917e98737a88a3520bdc35fd3fc5c9a6a9b"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+      - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/llama.cpp/llama.cpp-b5997_rocm6.4.0_ubuntu24.04_light/images/sha256-cc324e6faeedf0e400011f07b49d2dc41a16bae257b2b7befa0f4e2e97231320"><i class="fab fa-docker fa-lg"></i> rocm/llama.cpp</a>
+      - `b5997 <https://github.com/ROCm/llama.cpp/tree/release/b5997>`__
+      - 24.04
+
+Key ROCm libraries for llama.cpp
+================================================================================
+
+llama.cpp functionality on ROCm is determined by its underlying library
+dependencies. These ROCm components affect the capabilities, performance, and
+feature set available to developers.
+
+.. list-table::
+    :header-rows: 1
+
+    * - ROCm library
+      - Version
+      - Purpose
+      - Usage
+    * - `hipBLAS <https://github.com/ROCm/hipBLAS>`__
+      - :version-ref:`hipBLAS rocm_version`
+      - Provides GPU-accelerated Basic Linear Algebra Subprograms (BLAS) for
+        matrix and vector operations.
+      - Supports operations such as matrix multiplication, matrix-vector
+        products, and tensor contractions. Utilized in both dense and batched
+        linear algebra operations.
+    * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
+      - :version-ref:`hipBLASLt rocm_version`
+      - hipBLASLt is an extension of the hipBLAS library, providing additional
+        features like epilogues fused into the matrix multiplication kernel or
+        use of integer tensor cores.
+      - By setting the flag ``ROCBLAS_USE_HIPBLASLT``, you can dispatch hipBLASLt
+        kernels where possible.
+    * - `rocWMMA <https://github.com/ROCm/rocWMMA>`__
+      - :version-ref:`rocWMMA rocm_version`
+      - Accelerates warp-level matrix-multiply and matrix-accumulate to speed up matrix
+        multiplication (GEMM) and accumulation operations with mixed precision
+        support.
+      - Can be used to enhance flash attention performance on AMD GPUs by enabling
+        the corresponding build flag at compile time.
--- a/docs/compatibility/ml-compatibility/ray-compatibility.rst
+++ b/docs/compatibility/ml-compatibility/ray-compatibility.rst
@@ -0,0 +1,111 @@
+:orphan:
+
+.. meta::
+    :description: Ray deep learning framework compatibility
+    :keywords: GPU, Ray compatibility
+
+.. version-set:: rocm_version latest
+
+*******************************************************************************
+Ray compatibility
+*******************************************************************************
+
+Ray is a unified framework for scaling AI and Python applications from your laptop 
+to a full cluster, without changing your code. Ray consists of `a core distributed 
+runtime <https://docs.ray.io/en/latest/ray-core/walkthrough.html>`_ and a set of
+`AI libraries <https://docs.ray.io/en/latest/ray-air/getting-started.html>`_ for 
+simplifying machine learning computations.
+
+Ray is a general-purpose framework that runs many types of workloads efficiently. 
+Any Python application can be scaled with Ray, without extra infrastructure.
+
+ROCm support for Ray is upstreamed, and you can build the official source code
+with ROCm support: 
+
+- ROCm support for Ray is hosted in the official `https://github.com/ROCm/ray 
+  <https://github.com/ROCm/ray>`_ repository.
+
+- Due to independent compatibility considerations, this location differs from the 
+  `https://github.com/ray-project/ray <https://github.com/ray-project/ray>`_ upstream repository.
+
+- To install Ray, use the prebuilt :ref:`Docker image <ray-docker-compat>` 
+  which includes ROCm, Ray, and all required dependencies.
+
+  - See the :doc:`ROCm Ray installation guide <rocm-install-on-linux:install/3rd-party/ray-install>` 
+    for instructions to get started.
+
+  - See the `Installation section <https://docs.ray.io/en/latest/ray-overview/installation.html>`_ 
+    in the upstream Ray documentation.
+
+  - The Docker image provided is based on the upstream Ray `Daily Release (Nightly) wheels <https://docs.ray.io/en/latest/ray-overview/installation.html#daily-releases-nightlies>`__ 
+    corresponding to commit `005c372 <https://github.com/ray-project/ray/commit/005c372262e050d5745f475e22e64305fa07f8b8>`__.
+
+.. note::
+
+  Ray is supported on ROCm 6.4.1.
+
+Supported devices
+================================================================================
+
+**Officially Supported**: AMD Instinct™ MI300X, MI210
+
+
+Use cases and recommendations
+================================================================================
+
+* The `Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm 
+  Integration <https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html>`__  
+  blog provides an overview of Volcano Engine Reinforcement Learning (verl) 
+  for large language models (LLMs) and discusses its benefits in large-scale 
+  reinforcement learning from human feedback (RLHF). It uses Ray as part of a 
+  hybrid orchestration engine to schedule and coordinate training and inference 
+  tasks in parallel, enabling optimized resource utilization and potential overlap 
+  between these phases. This dynamic resource allocation strategy significantly 
+  improves overall system efficiency. The blog presents verl’s performance results, 
+  focusing on throughput and convergence accuracy achieved on AMD Instinct™ MI300X 
+  GPUs. Follow this guide to get started with verl on AMD Instinct GPUs and 
+  accelerate your RLHF training with ROCm-optimized performance.
+
+* The `Exploring Use Cases for Scalable AI: Implementing Ray with ROCm Support for Efficient ML Workflows 
+  <https://rocm.blogs.amd.com/artificial-intelligence/rocm-ray/README.html>`__
+  blog post describes key use cases such as training and inference for large language models (LLMs), 
+  model serving, hyperparameter tuning, reinforcement learning, and the orchestration of large-scale 
+  workloads using Ray in the ROCm environment.
+
+For more use cases and recommendations, see the AMD GPU tabs in the `Accelerator Support 
+topic <https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html#accelerator-support>`__ 
+of the Ray core documentation and refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__, 
+where you can search for Ray examples and best practices to optimize your workloads on AMD GPUs.
+
+.. _ray-docker-compat:
+
+Docker image compatibility
+================================================================================
+
+.. |docker-icon| raw:: html
+
+   <i class="fab fa-docker"></i>
+
+AMD validates and publishes ready-made `ROCm Ray Docker images <https://hub.docker.com/r/rocm/ray/tags>`__
+with ROCm backends on Docker Hub. The following Docker image tags and
+associated inventories represent the latest Ray version from the official Docker Hub and are validated for
+`ROCm 6.4.1 <https://repo.radeon.com/rocm/apt/6.4.1/>`_. Click the |docker-icon|
+icon to view the image on Docker Hub.
+
+.. list-table::
+    :header-rows: 1
+    :class: docker-image-compatibility
+
+    * - Docker image
+      - Ray
+      - PyTorch
+      - Ubuntu
+      - Python
+
+    * - .. raw:: html
+
+           <a href="https://hub.docker.com/layers/rocm/ray/ray-2.48.0.post0_rocm6.4.1_ubuntu24.04_py3.12_pytorch2.6.0/images/sha256-0d166fe6bdced38338c78eedfb96eff92655fb797da3478a62dd636365133cc0"><i class="fab fa-docker fa-lg"></i> rocm/ray</a>
+      - `2.48.0.post0 <https://github.com/ROCm/ray/tree/release/2.48.0.post0>`_
+      - 2.6.0+git684f6f2
+      - 24.04
+      - `3.12.10 <https://www.python.org/downloads/release/python-31210/>`_
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -9,17 +9,21 @@ import shutil
 import sys
 from pathlib import Path

-shutil.copy2("../RELEASE.md", "./about/release-notes.md")
-shutil.copy2("../CHANGELOG.md", "./release/changelog.md")
+gh_release_path = os.path.join("..", "RELEASE.md")
+gh_changelog_path = os.path.join("..", "CHANGELOG.md")
+sphinx_release_path = os.path.join("about", "release-notes.md")
+sphinx_changelog_path = os.path.join("release", "changelog.md")
+shutil.copy2(gh_release_path, sphinx_release_path)
+shutil.copy2(gh_changelog_path, sphinx_changelog_path)

 # Mark the consolidated changelog as orphan to prevent Sphinx from warning about missing toctree entries
-with open("./release/changelog.md", "r+") as file:
+with open(sphinx_changelog_path, "r+", encoding="utf-8") as file:
    content = file.read()
    file.seek(0)
    file.write(":orphan:\n" + content)

 # Replace GitHub-style [!ADMONITION]s with Sphinx-compatible ```{admonition} blocks
-with open("./release/changelog.md", "r") as file:
+with open(sphinx_changelog_path, "r", encoding="utf-8") as file:
    lines = file.readlines()

    modified_lines = []
@@ -57,11 +61,14 @@ with open("./release/changelog.md", "r") as file:

    file.close()

-    with open("./release/changelog.md", 'w') as file:
+    with open(sphinx_changelog_path, "w", encoding="utf-8") as file:
        file.writelines(modified_lines)

-os.system("mkdir -p ../_readthedocs/html/downloads")
-os.system("cp compatibility/compatibility-matrix-historical-6.0.csv ../_readthedocs/html/downloads/compatibility-matrix-historical-6.0.csv")
+matrix_path = os.path.join("compatibility", "compatibility-matrix-historical-6.0.csv")
+rtd_path = os.path.join("..", "_readthedocs", "html", "downloads")
+if not os.path.exists(rtd_path):
+    os.makedirs(rtd_path)
+shutil.copy2(matrix_path, rtd_path)

 latex_engine = "xelatex"
 latex_elements = {
@@ -101,6 +108,8 @@ article_pages = [
    {"file": "compatibility/ml-compatibility/dgl-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/megablocks-compatibility", "os": ["linux"]},
    {"file": "compatibility/ml-compatibility/taichi-compatibility", "os": ["linux"]},
+    {"file": "compatibility/ml-compatibility/ray-compatibility", "os": ["linux"]},
+    {"file": "compatibility/ml-compatibility/llama-cpp-compatibility", "os": ["linux"]},
    {"file": "how-to/deep-learning-rocm", "os": ["linux"]},

    {"file": "how-to/rocm-for-ai/index", "os": ["linux"]},
@@ -117,11 +126,15 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
@@ -147,6 +160,8 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},

--- a/docs/contribute/building.md
+++ b/docs/contribute/building.md
@@ -28,6 +28,10 @@ See the [Python requirements file](https://github.com/ROCm/ROCm/blob/develop/doc

 Use the Python Virtual Environment (`venv`) and run the following commands from the project root:

+::::{tab-set}
+:::{tab-item} Linux and WSL
+:sync: linux
+
 ```sh
 python3 -mvenv .venv

@@ -35,6 +39,20 @@ python3 -mvenv .venv
 .venv/bin/python -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
 ```

+:::
+:::{tab-item} Windows
+:sync: windows
+
+```powershell
+python -mvenv .venv
+
+.venv\Scripts\python.exe -m pip install -r docs/sphinx/requirements.txt
+.venv\Scripts\python.exe -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
+```
+
+:::
+::::
+
 Navigate to `_build/html/index.html` and open this file in a web browser.

 ## Visual Studio Code
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
@@ -0,0 +1,91 @@
+vllm_benchmark:
+  unified_docker:
+    latest:
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
+      rocm_version: 6.4.1
+      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a
+      hipblaslt_version: 0.15
+  model_groups:
+    - group: Meta Llama
+      tag: llama
+      models:
+      - model: Llama 3.1 8B
+        mad_tag: pyt_vllm_llama-3.1-8b
+        model_repo: meta-llama/Llama-3.1-8B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+        precision: float16
+      - model: Llama 3.1 70B
+        mad_tag: pyt_vllm_llama-3.1-70b
+        model_repo: meta-llama/Llama-3.1-70B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+        precision: float16
+      - model: Llama 3.1 405B
+        mad_tag: pyt_vllm_llama-3.1-405b
+        model_repo: meta-llama/Llama-3.1-405B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+        precision: float16
+      - model: Llama 2 70B
+        mad_tag: pyt_vllm_llama-2-70b
+        model_repo: meta-llama/Llama-2-70b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+        precision: float16
+      - model: Llama 3.1 8B FP8
+        mad_tag: pyt_vllm_llama-3.1-8b_fp8
+        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 70B FP8
+        mad_tag: pyt_vllm_llama-3.1-70b_fp8
+        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 405B FP8
+        mad_tag: pyt_vllm_llama-3.1-405b_fp8
+        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+        precision: float8
+    - group: Mistral AI
+      tag: mistral
+      models:
+      - model: Mixtral MoE 8x7B
+        mad_tag: pyt_vllm_mixtral-8x7b
+        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+        precision: float16
+      - model: Mixtral MoE 8x22B
+        mad_tag: pyt_vllm_mixtral-8x22b
+        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+        precision: float16
+      - model: Mixtral MoE 8x7B FP8
+        mad_tag: pyt_vllm_mixtral-8x7b_fp8
+        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mixtral MoE 8x22B FP8
+        mad_tag: pyt_vllm_mixtral-8x22b_fp8
+        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        precision: float8
+    - group: Qwen
+      tag: qwen
+      models:
+      - model: QwQ-32B
+        mad_tag: pyt_vllm_qwq-32b
+        model_repo: Qwen/QwQ-32B
+        url: https://huggingface.co/Qwen/QwQ-32B
+        precision: float16
+      - model: Qwen3 30B A3B
+        mad_tag: pyt_vllm_qwen3-30b-a3b
+        model_repo: Qwen/Qwen3-30B-A3B
+        url: https://huggingface.co/Qwen/Qwen3-30B-A3B
+        precision: float16
+    - group: Microsoft Phi
+      tag: phi
+      models:
+      - model: Phi-4
+        mad_tag: pyt_vllm_phi-4
+        model_repo: microsoft/phi-4
+        url: https://huggingface.co/microsoft/phi-4
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
@@ -0,0 +1,163 @@
+vllm_benchmark:
+  unified_docker:
+    latest:
+      # Archived image details for the vLLM 0.9.1 (20250715) release.
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
+      rocm_version: 6.4.1
+      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a
+      hipblaslt_version: 0.15
+  model_groups:
+    - group: Meta Llama
+      tag: llama
+      models:
+      - model: Llama 3.1 8B
+        mad_tag: pyt_vllm_llama-3.1-8b
+        model_repo: meta-llama/Llama-3.1-8B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+        precision: float16
+      - model: Llama 3.1 70B
+        mad_tag: pyt_vllm_llama-3.1-70b
+        model_repo: meta-llama/Llama-3.1-70B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+        precision: float16
+      - model: Llama 3.1 405B
+        mad_tag: pyt_vllm_llama-3.1-405b
+        model_repo: meta-llama/Llama-3.1-405B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+        precision: float16
+      - model: Llama 2 7B
+        mad_tag: pyt_vllm_llama-2-7b
+        model_repo: meta-llama/Llama-2-7b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+        precision: float16
+      - model: Llama 2 70B
+        mad_tag: pyt_vllm_llama-2-70b
+        model_repo: meta-llama/Llama-2-70b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+        precision: float16
+      - model: Llama 3.1 8B FP8
+        mad_tag: pyt_vllm_llama-3.1-8b_fp8
+        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 70B FP8
+        mad_tag: pyt_vllm_llama-3.1-70b_fp8
+        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 405B FP8
+        mad_tag: pyt_vllm_llama-3.1-405b_fp8
+        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+        precision: float8
+    - group: Mistral AI
+      tag: mistral
+      models:
+      - model: Mixtral MoE 8x7B
+        mad_tag: pyt_vllm_mixtral-8x7b
+        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+        precision: float16
+      - model: Mixtral MoE 8x22B
+        mad_tag: pyt_vllm_mixtral-8x22b
+        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+        precision: float16
+      - model: Mistral 7B
+        mad_tag: pyt_vllm_mistral-7b
+        model_repo: mistralai/Mistral-7B-Instruct-v0.3
+        url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+        precision: float16
+      - model: Mixtral MoE 8x7B FP8
+        mad_tag: pyt_vllm_mixtral-8x7b_fp8
+        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mixtral MoE 8x22B FP8
+        mad_tag: pyt_vllm_mixtral-8x22b_fp8
+        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mistral 7B FP8
+        mad_tag: pyt_vllm_mistral-7b_fp8
+        model_repo: amd/Mistral-7B-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
+        precision: float8
+    - group: Qwen
+      tag: qwen
+      models:
+      - model: Qwen2 7B
+        mad_tag: pyt_vllm_qwen2-7b
+        model_repo: Qwen/Qwen2-7B-Instruct
+        url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
+        precision: float16
+      - model: Qwen2 72B
+        mad_tag: pyt_vllm_qwen2-72b
+        model_repo: Qwen/Qwen2-72B-Instruct
+        url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
+        precision: float16
+      - model: QwQ-32B
+        mad_tag: pyt_vllm_qwq-32b
+        model_repo: Qwen/QwQ-32B
+        url: https://huggingface.co/Qwen/QwQ-32B
+        precision: float16
+        tunableop: true
+    - group: Databricks DBRX
+      tag: dbrx
+      models:
+      - model: DBRX Instruct
+        mad_tag: pyt_vllm_dbrx-instruct
+        model_repo: databricks/dbrx-instruct
+        url: https://huggingface.co/databricks/dbrx-instruct
+        precision: float16
+      - model: DBRX Instruct FP8
+        mad_tag: pyt_vllm_dbrx_fp8
+        model_repo: amd/dbrx-instruct-FP8-KV
+        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
+        precision: float8
+    - group: Google Gemma
+      tag: gemma
+      models:
+      - model: Gemma 2 27B
+        mad_tag: pyt_vllm_gemma-2-27b
+        model_repo: google/gemma-2-27b
+        url: https://huggingface.co/google/gemma-2-27b
+        precision: float16
+    - group: Cohere
+      tag: cohere
+      models:
+      - model: C4AI Command R+ 08-2024
+        mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
+        model_repo: CohereForAI/c4ai-command-r-plus-08-2024
+        url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
+        precision: float16
+      - model: C4AI Command R+ 08-2024 FP8
+        mad_tag: pyt_vllm_command-r-plus_fp8
+        model_repo: amd/c4ai-command-r-plus-FP8-KV
+        url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
+        precision: float8
+    - group: DeepSeek
+      tag: deepseek
+      models:
+      - model: DeepSeek MoE 16B
+        mad_tag: pyt_vllm_deepseek-moe-16b-chat
+        model_repo: deepseek-ai/deepseek-moe-16b-chat
+        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
+        precision: float16
+    - group: Microsoft Phi
+      tag: phi
+      models:
+      - model: Phi-4
+        mad_tag: pyt_vllm_phi-4
+        model_repo: microsoft/phi-4
+        url: https://huggingface.co/microsoft/phi-4
+    - group: TII Falcon
+      tag: falcon
+      models:
+      - model: Falcon 180B
+        mad_tag: pyt_vllm_falcon-180b
+        model_repo: tiiuae/falcon-180B
+        url: https://huggingface.co/tiiuae/falcon-180B
+        precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml
@@ -1,12 +1,11 @@
-sglang_benchmark:
-  unified_docker:
-    latest:
-      pull_tag: lmsysorg/sglang:v0.4.5-rocm630
+dockers:
+  - pull_tag: lmsysorg/sglang:v0.4.5-rocm630
    docker_hub_url: https://hub.docker.com/layers/lmsysorg/sglang/v0.4.5-rocm630/images/sha256-63d2cb760a237125daf6612464cfe2f395c0784e21e8b0ea37d551cd10d3c951
-      rocm_version: 6.3.0
-      sglang_version: 0.4.5 (0.4.5-rocm)
-      pytorch_version: 2.6.0a0+git8d4926e
-  model_groups:
+    components:
+      ROCm: 6.3.0
+      SGLang: 0.4.5 (0.4.5-rocm)
+      PyTorch: 2.6.0a0+git8d4926e
+model_groups:
  - group: DeepSeek
    tag: deepseek
    models:
--- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
@@ -1,14 +1,12 @@
-vllm_benchmark:
-  unified_docker:
-    latest:
-      # TODO: update me
-      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
-      rocm_version: 6.4.1
-      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
-      pytorch_version: 2.7.0+gitf717b2a
-      hipblaslt_version: 0.15
-  model_groups:
+dockers:
+  - pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.1_20250909
+    docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.1_20250909/images/sha256-1113268572e26d59b205792047bea0e61e018e79aeadceba118b7bf23cb3715c
+    components:
+      ROCm: 6.4.1
+      vLLM: 0.10.1 (0.10.1rc2.dev409+g0b6bf6691.rocm641)
+      PyTorch: 2.7.0+gitf717b2a
+      hipBLASLt: 0.15
+model_groups:
  - group: Meta Llama
    tag: llama
    models:
@@ -17,41 +15,85 @@ vllm_benchmark:
      model_repo: meta-llama/Llama-3.1-8B-Instruct
      url: https://huggingface.co/meta-llama/Llama-3.1-8B
      precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
    - model: Llama 3.1 70B
      mad_tag: pyt_vllm_llama-3.1-70b
      model_repo: meta-llama/Llama-3.1-70B-Instruct
      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
    - model: Llama 3.1 405B
      mad_tag: pyt_vllm_llama-3.1-405b
      model_repo: meta-llama/Llama-3.1-405B-Instruct
      url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
      precision: float16
-      - model: Llama 2 7B
-        mad_tag: pyt_vllm_llama-2-7b
-        model_repo: meta-llama/Llama-2-7b-chat-hf
-        url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
-        precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
    - model: Llama 2 70B
      mad_tag: pyt_vllm_llama-2-70b
      model_repo: meta-llama/Llama-2-70b-chat-hf
      url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 4096
+        max_num_batched_tokens: 4096
+        max_model_len: 4096
    - model: Llama 3.1 8B FP8
      mad_tag: pyt_vllm_llama-3.1-8b_fp8
      model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
      url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
      precision: float8
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
    - model: Llama 3.1 70B FP8
      mad_tag: pyt_vllm_llama-3.1-70b_fp8
      model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
      url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
    - model: Llama 3.1 405B FP8
      mad_tag: pyt_vllm_llama-3.1-405b_fp8
      model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
      url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
  - group: Mistral AI
    tag: mistral
    models:
@@ -60,92 +102,76 @@ vllm_benchmark:
      model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
      url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
      precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
    - model: Mixtral MoE 8x22B
      mad_tag: pyt_vllm_mixtral-8x22b
      model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
      url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
      precision: float16
-      - model: Mistral 7B
-        mad_tag: pyt_vllm_mistral-7b
-        model_repo: mistralai/Mistral-7B-Instruct-v0.3
-        url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
-        precision: float16
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 65536
+        max_num_batched_tokens: 65536
+        max_model_len: 8192
    - model: Mixtral MoE 8x7B FP8
      mad_tag: pyt_vllm_mixtral-8x7b_fp8
      model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
      url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
      precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
    - model: Mixtral MoE 8x22B FP8
      mad_tag: pyt_vllm_mixtral-8x22b_fp8
      model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
      url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
      precision: float8
-      - model: Mistral 7B FP8
-        mad_tag: pyt_vllm_mistral-7b_fp8
-        model_repo: amd/Mistral-7B-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
-        precision: float8
+      config:
+        tp: 8
+        dtype: auto
+        kv_cache_dtype: fp8
+        max_seq_len_to_capture: 65536
+        max_num_batched_tokens: 65536
+        max_model_len: 8192
  - group: Qwen
    tag: qwen
    models:
-      - model: Qwen2 7B
-        mad_tag: pyt_vllm_qwen2-7b
-        model_repo: Qwen/Qwen2-7B-Instruct
-        url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
-        precision: float16
-      - model: Qwen2 72B
-        mad_tag: pyt_vllm_qwen2-72b
-        model_repo: Qwen/Qwen2-72B-Instruct
-        url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
-        precision: float16
    - model: QwQ-32B
      mad_tag: pyt_vllm_qwq-32b
      model_repo: Qwen/QwQ-32B
      url: https://huggingface.co/Qwen/QwQ-32B
      precision: float16
-        tunableop: true
-    - group: Databricks DBRX
-      tag: dbrx
-      models:
-      - model: DBRX Instruct
-        mad_tag: pyt_vllm_dbrx-instruct
-        model_repo: databricks/dbrx-instruct
-        url: https://huggingface.co/databricks/dbrx-instruct
-        precision: float16
-      - model: DBRX Instruct FP8
-        mad_tag: pyt_vllm_dbrx_fp8
-        model_repo: amd/dbrx-instruct-FP8-KV
-        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
-        precision: float8
-    - group: Google Gemma
-      tag: gemma
-      models:
-      - model: Gemma 2 27B
-        mad_tag: pyt_vllm_gemma-2-27b
-        model_repo: google/gemma-2-27b
-        url: https://huggingface.co/google/gemma-2-27b
-        precision: float16
-    - group: Cohere
-      tag: cohere
-      models:
-      - model: C4AI Command R+ 08-2024
-        mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
-        model_repo: CohereForAI/c4ai-command-r-plus-08-2024
-        url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
-        precision: float16
-      - model: C4AI Command R+ 08-2024 FP8
-        mad_tag: pyt_vllm_command-r-plus_fp8
-        model_repo: amd/c4ai-command-r-plus-FP8-KV
-        url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
-        precision: float8
-    - group: DeepSeek
-      tag: deepseek
-      models:
-      - model: DeepSeek MoE 16B
-        mad_tag: pyt_vllm_deepseek-moe-16b-chat
-        model_repo: deepseek-ai/deepseek-moe-16b-chat
-        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 131072
+        max_num_batched_tokens: 131072
+        max_model_len: 8192
+    - model: Qwen3 30B A3B
+      mad_tag: pyt_vllm_qwen3-30b-a3b
+      model_repo: Qwen/Qwen3-30B-A3B
+      url: https://huggingface.co/Qwen/Qwen3-30B-A3B
      precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 32768
+        max_num_batched_tokens: 32768
+        max_model_len: 8192
  - group: Microsoft Phi
    tag: phi
    models:
@@ -153,11 +179,10 @@ vllm_benchmark:
      mad_tag: pyt_vllm_phi-4
      model_repo: microsoft/phi-4
      url: https://huggingface.co/microsoft/phi-4
-    - group: TII Falcon
-      tag: falcon
-      models:
-      - model: Falcon 180B
-        mad_tag: pyt_vllm_falcon-180b
-        model_repo: tiiuae/falcon-180B
-        url: https://huggingface.co/tiiuae/falcon-180B
-        precision: float16
+      config:
+        tp: 1
+        dtype: auto
+        kv_cache_dtype: auto
+        max_seq_len_to_capture: 16384
+        max_num_batched_tokens: 16384
+        max_model_len: 8192
--- a/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
@@ -0,0 +1,72 @@
+dockers:
+  - pull_tag: rocm/jax-training:maxtext-v25.7
+    docker_hub_url: https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025
+    components:
+      ROCm: 6.4.1
+      JAX: 0.5.0
+      Python: 3.10.12
+      Transformer Engine: 2.1.0+90d703dd
+      hipBLASLt: 1.x.x
+  - pull_tag: rocm/jax-training:maxtext-v25.7-jax060
+    docker_hub_url: https://hub.docker.com/r/rocm/jax-training/tags?name=maxtext-v25.7-jax060
+    components:
+      ROCm: 6.4.1
+      JAX: 0.6.0
+      Python: 3.10.12
+      Transformer Engine: 2.1.0+90d703dd
+      hipBLASLt: 1.1.0-499ece1c21
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: jax_maxtext_train_llama-3.3-70b
+        model_repo: Llama-3.3-70B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3.1 8B
+        mad_tag: jax_maxtext_train_llama-3.1-8b
+        model_repo: Llama-3.1-8B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3.1 70B
+        mad_tag: jax_maxtext_train_llama-3.1-70b
+        model_repo: Llama-3.1-70B
+        precision: bf16
+        doc_options: ["single-node"]
+      - model: Llama 3 8B
+        mad_tag: jax_maxtext_train_llama-3-8b
+        multinode_training_script: llama3_8b_multinode.sh
+        doc_options: ["multi-node"]
+      - model: Llama 3 70B
+        mad_tag: jax_maxtext_train_llama-3-70b
+        multinode_training_script: llama3_70b_multinode.sh
+        doc_options: ["multi-node"]
+      - model: Llama 2 7B
+        mad_tag: jax_maxtext_train_llama-2-7b
+        model_repo: Llama-2-7B
+        precision: bf16
+        multinode_training_script: llama2_7b_multinode.sh
+        doc_options: ["single-node", "multi-node"]
+      - model: Llama 2 70B
+        mad_tag: jax_maxtext_train_llama-2-70b
+        model_repo: Llama-2-70B
+        precision: bf16
+        multinode_training_script: llama2_70b_multinode.sh
+        doc_options: ["single-node", "multi-node"]
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V2-Lite (16B)
+        mad_tag: jax_maxtext_train_deepseek-v2-lite-16b
+        model_repo: DeepSeek-V2-lite
+        precision: bf16
+        doc_options: ["single-node"]
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: jax_maxtext_train_mixtral-8x7b
+        model_repo: Mixtral-8x7B
+        precision: bf16
+        doc_options: ["single-node"]
--- a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
@@ -1,26 +1,15 @@
 dockers:
-  - pull_tag: rocm/megatron-lm:v25.6_py312
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
    components:
-      ROCm: 6.4.1
-      PyTorch: 2.8.0a0+git7d205b2
-      Python: 3.12
-      Transformer Engine: 2.1.0.dev0+8c4a512
-      hipBLASLt: 393e413
-      Triton: 3.3.0
-      RCCL: 2.23.4.7a84c5d
-    doc_name: Ubuntu 24.04 + Python 3.12
-  - pull_tag: rocm/megatron-lm:v25.6_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
-    components:
-      ROCm: 6.4.1
-      PyTorch: 2.8.0a0+git7d205b2
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
      Python: "3.10"
-      Transformer Engine: 2.1.0.dev0+8c4a512
-      hipBLASLt: 393e413
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
      Triton: 3.3.0
-      RCCL: 2.23.4.7a84c5d
-    doc_name: Ubuntu 22.04 + Python 3.10
+      RCCL: 2.22.3
 model_groups:
  - group: Meta Llama
    tag: llama
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml
@@ -0,0 +1,60 @@
+dockers:
+  - pull_tag: rocm/megatron-lm:v25.6_py312
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
+    components:
+      ROCm: 6.4.1
+      PyTorch: 2.8.0a0+git7d205b2
+      Python: "3.12"
+      Transformer Engine: 2.1.0.dev0+8c4a512
+      hipBLASLt: "393e413"
+      Triton: 3.3.0
+      RCCL: 2.23.4.7a84c5d
+    doc_name: Ubuntu 24.04 + Python 3.12
+  - pull_tag: rocm/megatron-lm:v25.6_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
+    components:
+      ROCm: 6.4.1
+      PyTorch: 2.8.0a0+git7d205b2
+      Python: "3.10"
+      Transformer Engine: 2.1.0.dev0+8c4a512
+      hipBLASLt: "393e413"
+      Triton: 3.3.0
+      RCCL: 2.23.4.7a84c5d
+    doc_name: Ubuntu 22.04 + Python 3.10
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
+      - model: Llama 3.1 8B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
+      - model: Llama 3.1 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
+      - model: Llama 3.1 70B (proxy)
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
+      - model: Llama 2 7B
+        mad_tag: pyt_megatron_lm_train_llama-2-7b
+      - model: Llama 2 70B
+        mad_tag: pyt_megatron_lm_train_llama-2-70b
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
+      - model: DeepSeek-V2-Lite
+        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
+      - model: Qwen 2.5 72B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
@@ -0,0 +1,120 @@
+unified_docker:
+  latest:
+    pull_tag: rocm/pytorch-training:v25.6
+    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
+    rocm_version: 6.4.1
+    pytorch_version: 2.8.0a0+git7d205b2
+    python_version: 3.10.17
+    transformer_engine_version: 1.14.0+2f85f5f2
+    flash_attention_version: 3.0.0.post1
+    hipblaslt_version: 0.15.0-8c6919d
+    triton_version: 3.3.0
+model_groups:
+  - group: Pre-training
+    tag: pre-training
+    models:
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [pretrain]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+      precision: BF16
+      training_modes: [pretrain]
+    - model: FLUX.1-dev
+      mad_tag: pyt_train_flux
+      model_repo: Flux
+      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+      precision: BF16
+      training_modes: [pretrain]
+  - group: Fine-tuning
+    tag: fine-tuning
+    models:
+    - model: Llama 4 Scout 17B-16E
+      mad_tag: pyt_train_llama-4-scout-17b-16e
+      model_repo: Llama-4-17B_16E
+      url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.3 70B
+      mad_tag: pyt_train_llama-3.3-70b
+      model_repo: Llama-3.3-70B
+      url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 3.2 1B
+      mad_tag: pyt_train_llama-3.2-1b
+      model_repo: Llama-3.2-1B
+      url: https://huggingface.co/meta-llama/Llama-3.2-1B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 3B
+      mad_tag: pyt_train_llama-3.2-3b
+      model_repo: Llama-3.2-3B
+      url: https://huggingface.co/meta-llama/Llama-3.2-3B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 Vision 11B
+      mad_tag: pyt_train_llama-3.2-vision-11b
+      model_repo: Llama-3.2-Vision-11B
+      url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.2 Vision 90B
+      mad_tag: pyt_train_llama-3.2-vision-90b
+      model_repo: Llama-3.2-Vision-90B
+      url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 3.1 405B
+      mad_tag: pyt_train_llama-3.1-405b
+      model_repo: Llama-3.1-405B
+      url: https://huggingface.co/meta-llama/Llama-3.1-405B
+      precision: BF16
+      training_modes: [finetune_qlora, HF_finetune_lora]
+    - model: Llama 3 8B
+      mad_tag: pyt_train_llama-3-8b
+      model_repo: Llama-3-8B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3 70B
+      mad_tag: pyt_train_llama-3-70b
+      model_repo: Llama-3-70B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 7B
+      mad_tag: pyt_train_llama-2-7b
+      model_repo: Llama-2-7B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 2 13B
+      mad_tag: pyt_train_llama-2-13b
+      model_repo: Llama-2-13B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 70B
+      mad_tag: pyt_train_llama-2-70b
+      model_repo: Llama-2-70B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
--- a/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
@@ -0,0 +1,58 @@
+dockers:
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
+    components:
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
+      Python: "3.10"
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
+      Triton: 3.3.0
+      RCCL: 2.22.3
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
+        config_name: llama3.3_70B-pretrain.yaml
+      - model: Llama 3.1 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
+        config_name: llama3.1_70B-pretrain.yaml
+      - model: Llama 3.1 8B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
+        config_name: llama3.1_8B-pretrain.yaml
+      - model: Llama 2 7B
+        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
+        config_name: llama2_7B-pretrain.yaml
+      - model: Llama 2 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
+        config_name: llama2_70B-pretrain.yaml
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
+        config_name: deepseek_v3-pretrain.yaml
+      - model: DeepSeek-V2-Lite
+        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
+        config_name: deepseek_v2_lite-pretrain.yaml
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
+        config_name: mixtral_8x7B_v0.1-pretrain.yaml
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
+        config_name: mixtral_8x22B_v0.1-pretrain.yaml
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
+        config_name: qwen2.5_7B-pretrain.yaml
+      - model: Qwen 2.5 72B
+        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
+        config_name: qwen2.5_72B-pretrain.yaml
--- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
@@ -1,38 +1,17 @@
-unified_docker:
-  latest:
-    pull_tag: rocm/pytorch-training:v25.6
-    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
-    rocm_version: 6.4.1
-    pytorch_version: 2.8.0a0+git7d205b2
-    python_version: 3.10.17
-    transformer_engine_version: 1.14.0+2f85f5f2
-    flash_attention_version: 3.0.0.post1
-    hipblaslt_version: 0.15.0-8c6919d
-    triton_version: 3.3.0
+dockers:
+  - pull_tag: rocm/pytorch-training:v25.7
+    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
+    components:
+      ROCm: 6.4.2
+      PyTorch: 2.8.0a0+gitd06a406
+      Python: 3.10.18
+      Transformer Engine: 2.2.0.dev0+94e53dd8
+      Flash Attention: 3.0.0.post1
+      hipBLASLt: 1.1.0-4b9a52edfc
+      Triton: 3.3.0
 model_groups:
-  - group: Pre-training
-    tag: pre-training
-    models:
-    - model: Llama 3.1 8B
-      mad_tag: pyt_train_llama-3.1-8b
-      model_repo: Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-      training_modes: [pretrain]
-    - model: Llama 3.1 70B
-      mad_tag: pyt_train_llama-3.1-70b
-      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-      precision: BF16
-      training_modes: [pretrain]
-    - model: FLUX.1-dev
-      mad_tag: pyt_train_flux
-      model_repo: Flux
-      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-      precision: BF16
-      training_modes: [pretrain]
-  - group: Fine-tuning
-    tag: fine-tuning
+  - group: Meta Llama
+    tag: llama
    models:
    - model: Llama 4 Scout 17B-16E
      mad_tag: pyt_train_llama-4-scout-17b-16e
@@ -75,19 +54,19 @@ model_groups:
      model_repo: Llama-3.1-8B
      url: https://huggingface.co/meta-llama/Llama-3.1-8B
      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
+      training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
    - model: Llama 3.1 70B
      mad_tag: pyt_train_llama-3.1-70b
      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+      training_modes: [pretrain, finetune_fw, finetune_lora]
    - model: Llama 3.1 405B
      mad_tag: pyt_train_llama-3.1-405b
      model_repo: Llama-3.1-405B
      url: https://huggingface.co/meta-llama/Llama-3.1-405B
      precision: BF16
-      training_modes: [finetune_qlora, HF_finetune_lora]
+      training_modes: [finetune_qlora]
    - model: Llama 3 8B
      mad_tag: pyt_train_llama-3-8b
      model_repo: Llama-3-8B
@@ -117,4 +96,67 @@ model_groups:
      model_repo: Llama-2-70B
      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
      precision: BF16
-      training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
+      training_modes: [finetune_lora, finetune_qlora]
+  - group: OpenAI
+    tag: openai
+    models:
+    - model: GPT OSS 20B
+      mad_tag: pyt_train_gpt_oss_20b
+      model_repo: GPT-OSS-20B
+      url: https://huggingface.co/openai/gpt-oss-20b
+      precision: BF16
+      training_modes: [HF_finetune_lora]
+    - model: GPT OSS 120B
+      mad_tag: pyt_train_gpt_oss_120b
+      model_repo: GPT-OSS-120B
+      url: https://huggingface.co/openai/gpt-oss-120b
+      precision: BF16
+      training_modes: [HF_finetune_lora]
+  - group: Qwen
+    tag: qwen
+    models:
+    - model: Qwen 3 8B
+      mad_tag: pyt_train_qwen3-8b
+      model_repo: Qwen3-8B
+      url: https://huggingface.co/Qwen/Qwen3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Qwen 3 32B
+      mad_tag: pyt_train_qwen3-32b
+      model_repo: Qwen3-32B
+      url: https://huggingface.co/Qwen/Qwen3-32B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2.5 32B
+      mad_tag: pyt_train_qwen2.5-32b
+      model_repo: Qwen2.5-32B
+      url: https://huggingface.co/Qwen/Qwen2.5-32B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2.5 72B
+      mad_tag: pyt_train_qwen2.5-72b
+      model_repo: Qwen2.5-72B
+      url: https://huggingface.co/Qwen/Qwen2.5-72B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2 1.5B
+      mad_tag: pyt_train_qwen2-1.5b
+      model_repo: Qwen2-1.5B
+      url: https://huggingface.co/Qwen/Qwen2-1.5B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Qwen 2 7B
+      mad_tag: pyt_train_qwen2-7b
+      model_repo: Qwen2-7B
+      url: https://huggingface.co/Qwen/Qwen2-7B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+  - group: Flux
+    tag: flux
+    models:
+    - model: FLUX.1-dev
+      mad_tag: pyt_train_flux
+      model_repo: Flux
+      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+      precision: BF16
+      training_modes: [pretrain]
--- a/docs/how-to/deep-learning-rocm.rst
+++ b/docs/how-to/deep-learning-rocm.rst
@@ -2,58 +2,146 @@
   :description: How to install deep learning frameworks for ROCm
   :keywords: deep learning, frameworks, ROCm, install, PyTorch, TensorFlow, JAX, MAGMA, DeepSpeed, ML, AI

-********************************************
-Installing deep learning frameworks for ROCm
-********************************************
+**********************************
+Deep learning frameworks for ROCm
+**********************************

-ROCm provides a comprehensive ecosystem for deep learning development, including
-:ref:`libraries <artificial-intelligence-apis>` for optimized deep learning operations and ROCm-aware versions of popular
-deep learning frameworks and libraries such as PyTorch, TensorFlow, and JAX. ROCm works closely with these
-frameworks to ensure that framework-specific optimizations take advantage of AMD accelerator and GPU architectures.
+Deep learning frameworks provide environments for machine learning, training, fine-tuning, inference, and performance optimization.

-The following guides provide information on compatibility and supported
-features for these ROCm-enabled deep learning frameworks.
+ROCm offers a complete ecosystem for developing and running deep learning applications efficiently. It also provides ROCm-compatible versions of popular frameworks and libraries, such as PyTorch, TensorFlow, JAX, and others.

-* :doc:`PyTorch compatibility <../compatibility/ml-compatibility/pytorch-compatibility>`
-* :doc:`TensorFlow compatibility <../compatibility/ml-compatibility/tensorflow-compatibility>`
-* :doc:`JAX compatibility <../compatibility/ml-compatibility/jax-compatibility>`
-* :doc:`verl compatibility <../compatibility/ml-compatibility/verl-compatibility>`
-* :doc:`Stanford Megatron-LM compatibility <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`
-* :doc:`DGL compatibility <../compatibility/ml-compatibility/dgl-compatibility>`
-* :doc:`Megablocks compatibility <../compatibility/ml-compatibility/megablocks-compatibility>`
-* :doc:`Taichi compatibility <../compatibility/ml-compatibility/taichi-compatibility>`
+The AMD ROCm organization actively contributes to open-source development and collaborates closely with framework organizations. This collaboration ensures that framework-specific optimizations effectively leverage AMD GPUs and accelerators.

-This chart steps through typical installation workflows for installing deep learning frameworks for ROCm.
+The table below summarizes information about ROCm-enabled deep learning frameworks. It includes details on ROCm compatibility and third-party tool support, installation steps and options, and links to GitHub resources. For a complete list of supported framework versions on ROCm, see the :doc:`Compatibility matrix <../compatibility/compatibility-matrix>` topic.

-.. image:: ../data/how-to/framework_install_2024_07_04.png
-   :alt: Flowchart for installing ROCm-aware machine learning frameworks
-   :align: center
+.. list-table:: 
+    :header-rows: 1
+    :widths: 5 3 6 3

-See the installation instructions to get started.
+    * - Framework
+      - Installation
+      - Installation options
+      - GitHub

-* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
-* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
-* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
-* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
-* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
-* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/dgl-install>`
-* :doc:`Megablocks for ROCm <rocm-install-on-linux:install/3rd-party/megablocks-install>`
-* :doc:`Taichi for ROCm <rocm-install-on-linux:install/3rd-party/taichi-install>`
+    * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`__
+      - .. raw:: html

-.. note::
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`__
+        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`__
+        - `Upstream Dockerfile <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`__
+      - .. raw:: html

-   For guidance on installing ROCm itself, refer to :doc:`ROCm installation for Linux <rocm-install-on-linux:index>`.
+          <a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`__
+
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a> 
+
+    * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a> 
+
+    * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__
+
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `Ray <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/ray-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#using-a-prebuilt-docker-image-with-ray-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#install-ray-on-bare-metal-or-a-custom-container>`__
+        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/ray-install.html#build-your-own-docker-image>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/ray"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `llama.cpp <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/llama-cpp-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/llama-cpp-install.html#use-a-prebuilt-docker-image-with-llama-cpp-pre-installed>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/llama.cpp"><i class="fab fa-github fa-lg"></i></a>

 Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
 through the following guides.

 * :doc:`rocm-for-ai/index`

-* :doc:`Training <rocm-for-ai/training/index>`
+* :doc:`Use ROCm for training <rocm-for-ai/training/index>`

-* :doc:`Fine-tuning LLMs <rocm-for-ai/fine-tuning/index>`
+* :doc:`Use ROCm for fine-tuning LLMs <rocm-for-ai/fine-tuning/index>`

-* :doc:`Inference <rocm-for-ai/inference/index>`
+* :doc:`Use ROCm for AI inference <rocm-for-ai/inference/index>`

-* :doc:`Inference optimization <rocm-for-ai/inference-optimization/index>`
+* :doc:`Use ROCm for AI inference optimization <rocm-for-ai/inference-optimization/index>`

--- a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
@@ -939,7 +939,7 @@ hipBLASLt benchmarking
 The GEMM library
 `hipBLASLt <https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/index.html>`_
 provides a benchmark tool for its supported operations. Refer to the
-`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/benchmarks/README.md>`_
+`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/bench/README.md>`_
 for details.

 * Example 1: Benchmark mix fp8 GEMM
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.10.0-20250812.rst
@@ -0,0 +1,445 @@
+:orphan:
+
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                 ROCm vLLM Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   inference performance documentation. See :doc:`../vllm` for the latest version.
+
+.. _vllm-benchmark-unified-docker-812:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   a prebuilt, optimized environment for validating large language model (LLM)
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:
+
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+
+      * - `ROCm <https://github.com/ROCm/ROCm>`__
+        - {{ unified_docker.rocm_version }}
+
+      * - `vLLM <https://docs.vllm.ai/en/latest>`__
+        - {{ unified_docker.vllm_version }}
+
+      * - `PyTorch <https://github.com/ROCm/pytorch>`__
+        - {{ unified_docker.pytorch_version }}
+
+      * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
+        - {{ unified_docker.hipblaslt_version }}
+
+With this Docker image, you can quickly test the :ref:`expected
+inference performance numbers <vllm-benchmark-performance-measurements-812>` for
+MI300X series accelerators.
+
+What's new
+==========
+
+The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
+
+* Upgraded to vLLM v0.10.
+
+* FP8 KV cache support via AITER.
+
+* Full graph capture support via AITER.
+
+Supported models
+================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   .. _vllm-benchmark-available-models-812:
+
+   The following models are supported for inference performance benchmarking
+   with vLLM and ROCm. Some instructions, commands, and recommendations in this
+   documentation might vary by model -- select one to get started.
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+      <div class="row">
+         <div class="col-2 me-2 model-param-head">Model group</div>
+         <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+         </div>
+      </div>
+
+      <div class="row mt-1">
+         <div class="col-2 me-2 model-param-head">Model</div>
+         <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+         </div>
+      </div>
+      </div>
+
+   .. _vllm-benchmark-vllm-812:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. note::
+
+         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+         Some models require access authorization prior to use via an external license agreement through a third party.
+
+      {% endfor %}
+   {% endfor %}
+
+.. note::
+
+   vLLM is a toolkit and library for LLM inference and serving. AMD implements
+   high-performance custom kernels and modules in vLLM to enhance performance.
+   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+   more information.
+
+.. _vllm-benchmark-performance-measurements-812:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+page provides reference throughput and serving measurements for inferencing popular AI models.
+
+.. important::
+
+   The performance data presented in
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+   only reflects the latest version of this inference benchmarking environment.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before running inference benchmarks.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.10.0_20250812-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   Pull the Docker image
+   =====================
+
+   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull {{ unified_docker.pull_tag }}
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to reproduce the
+   benchmark results:
+
+   .. _vllm-benchmark-mad-812:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
+            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
+            and ``{{ model.mad_tag }}_serving.csv``.
+
+            Although the :ref:`available models
+            <vllm-benchmark-available-models-812>` are preconfigured to collect
+            offline throughput and online serving performance data, you can
+            also change the benchmarking parameters. See the standalone
+            benchmarking tab for more information.
+
+            {% if model.tunableop %}
+
+            .. note::
+
+               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
+               TunableOp automatically explores different implementations and configurations of certain PyTorch
+               operators to find the fastest one for your hardware.
+
+               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
+               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
+               the ``--tunableop on`` argument in your run.
+
+               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
+               performance-collection run.
+
+            {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            .. rubric:: Download the Docker image and required scripts
+
+            1. Run the vLLM benchmark tool independently by starting the
+               `Docker container <{{ unified_docker.docker_hub_url }}>`_
+               as shown in the following snippet.
+
+               .. code-block:: shell
+
+                  docker pull {{ unified_docker.pull_tag }}
+                  docker run -it \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --shm-size 16G \
+                      --security-opt seccomp=unconfined \
+                      --security-opt apparmor=unconfined \
+                      --cap-add=SYS_PTRACE \
+                      -v $(pwd):/workspace \
+                      --env HUGGINGFACE_HUB_CACHE=/workspace \
+                      --name test \
+                      {{ unified_docker.pull_tag }}
+
+            2. In the Docker container, clone the ROCm MAD repository and navigate to the
+               benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD/scripts/vllm
+
+            3. To start the benchmark, use the following command with the appropriate options.
+
+               .. code-block::
+
+                  ./run.sh \
+                      --config $CONFIG_CSV \
+                      --model_repo {{ model.model_repo }} \
+                      <overrides>
+
+               .. dropdown:: Benchmark options
+                  :open:
+
+                  .. list-table::
+                     :header-rows: 1
+                     :align: center
+
+                     * - Name
+                       - Options
+                       - Description
+
+                     * - ``--config``
+                       - ``configs/default.csv``
+                       - Run configs from the CSV for the chosen model repo and benchmark.
+
+                     * -
+                       - ``configs/extended.csv``
+                       - 
+
+                     * -
+                       - ``configs/performance.csv``
+                       - 
+
+                     * - ``--benchmark``
+                       - ``throughput``
+                       - Measure offline end-to-end throughput.
+
+                     * - 
+                       - ``serving``
+                       - Measure online serving performance.
+
+                     * - 
+                       - ``all``
+                       - Measure both throughput and serving.
+
+                     * - ``<overrides>``
+                       - See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
+                       - Additional overrides to the config CSV.
+
+                  The input sequence length, output sequence length, and tensor parallel (TP) are
+                  already configured. You don't need to specify them with this script.
+
+               .. note::
+
+                  For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
+
+                  If you encounter the following error, pass your access-authorized Hugging
+                  Face token to the gated models.
+
+                  .. code-block::
+
+                     OSError: You are trying to access a gated repo.
+
+                     # pass your HF_TOKEN
+                     export HF_TOKEN=$your_personal_hf_token
+
+            .. rubric:: Benchmarking examples
+
+            Here are some examples of running the benchmark with various options:
+
+            * Throughput benchmark
+
+              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block:: shell
+
+                 export MAD_MODEL_NAME={{ model.mad_tag }}
+                 ./run.sh \
+                     --config configs/default.csv \
+                     --model_repo {{model.model_repo}} \
+                     --benchmark throughput
+
+              Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
+
+            * Serving benchmark
+
+              Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block:: shell
+
+                 export MAD_MODEL_NAME={{ model.mad_tag }}
+                 ./run.sh \
+                     --config configs/default.csv \
+                     --model_repo {{model.model_repo}} \
+                     --benchmark serving
+
+              Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.
+
+            .. raw:: html
+
+               <style>
+               mjx-container[jax="CHTML"][display="true"] {
+                  text-align: left;
+                  margin: 0;
+               }
+               </style>
+
+            .. note::
+
+               Throughput is calculated as:
+
+               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+      {% endfor %}
+   {% endfor %}
+
+Advanced usage
+==============
+
+For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
+see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
+
+Reproducing the Docker image
+----------------------------
+
+To reproduce this ROCm/vLLM Docker image release, follow these steps:
+
+1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
+
+   .. code-block:: shell
+
+      git clone https://github.com/ROCm/vllm.git
+
+2. Check out the specific release commit.
+
+   .. code-block:: shell
+
+      cd vllm
+      git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978
+
+3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
+
+   .. code-block:: shell
+
+      docker build -f docker/Dockerfile.rocm -t vllm-rocm .
+
+Further reading
+===============
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702.rst
@@ -14,7 +14,7 @@ vLLM inference performance testing
   This documentation does not reflect the latest version of ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.

-.. _vllm-benchmark-unified-docker:
+.. _vllm-benchmark-unified-docker-702:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml

@@ -77,7 +77,7 @@ vLLM inference performance testing
        </div>
      </div>

-   .. _vllm-benchmark-vllm:
+   .. _vllm-benchmark-vllm-702:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
@@ -159,7 +159,7 @@ vLLM inference performance testing
   Once the setup is complete, choose between two options to reproduce the
   benchmark results:

-   .. _vllm-benchmark-mad:
+   .. _vllm-benchmark-mad-702:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
@@ -0,0 +1,450 @@
+:orphan:
+
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                 ROCm vLLM Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   inference performance documentation. See :doc:`../vllm` for the latest version.
+
+.. _vllm-benchmark-unified-docker-715:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   a prebuilt, optimized environment for validating large language model (LLM)
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:
+
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+
+      * - `ROCm <https://github.com/ROCm/ROCm>`__
+        - {{ unified_docker.rocm_version }}
+
+      * - `vLLM <https://docs.vllm.ai/en/latest>`__
+        - {{ unified_docker.vllm_version }}
+
+      * - `PyTorch <https://github.com/ROCm/pytorch>`__
+        - {{ unified_docker.pytorch_version }}
+
+      * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
+        - {{ unified_docker.hipblaslt_version }}
+
+With this Docker image, you can quickly test the :ref:`expected
+inference performance numbers <vllm-benchmark-performance-measurements-715>` for
+MI300X series accelerators.
+
+What's new
+==========
+
+The following is a summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
+
+* The ``--compilation-config`` parameter is no longer required as its options are now enabled by default.
+  This parameter has been removed from the benchmarking script.
+
+* Resolved Llama 3.1 405B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
+  This parameter has been removed from the benchmarking script.
+
+* Fixed a ``+rms_norm`` custom kernel issue.
+
+* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``.
+
+* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
+
+Supported models
+================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   .. _vllm-benchmark-available-models-715:
+
+   The following models are supported for inference performance benchmarking
+   with vLLM and ROCm. Some instructions, commands, and recommendations in this
+   documentation might vary by model -- select one to get started.
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+      <div class="row">
+         <div class="col-2 me-2 model-param-head">Model group</div>
+         <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+         </div>
+      </div>
+
+      <div class="row mt-1">
+         <div class="col-2 me-2 model-param-head">Model</div>
+         <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+         </div>
+      </div>
+      </div>
+
+   .. _vllm-benchmark-vllm-715:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. note::
+
+         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+         Some models require access authorization prior to use via an external license agreement through a third party.
+
+      {% endfor %}
+   {% endfor %}
+
+.. note::
+
+   vLLM is a toolkit and library for LLM inference and serving. AMD implements
+   high-performance custom kernels and modules in vLLM to enhance performance.
+   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+   more information.
+
+.. _vllm-benchmark-performance-measurements-715:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+page provides reference throughput and latency measurements for inferencing popular AI models.
+
+.. important::
+
+   The performance data presented in
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+   only reflects the latest version of this inference benchmarking environment.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before running inference workloads.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark-models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   Pull the Docker image
+   =====================
+
+   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull {{ unified_docker.pull_tag }}
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to reproduce the
+   benchmark results:
+
+   .. _vllm-benchmark-mad-715:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
+
+            Although the :ref:`available models <vllm-benchmark-available-models-715>` are preconfigured
+            to collect latency and throughput performance data, you can also change the benchmarking
+            parameters. See the standalone benchmarking tab for more information.
+
+            {% if model.tunableop %}
+
+            .. note::
+
+               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
+               TunableOp automatically explores different implementations and configurations of certain PyTorch
+               operators to find the fastest one for your hardware.
+
+               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
+               (see
+               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
+               To enable it, include the ``--tunableop on`` argument in your
+               run.
+
+               Enabling TunableOp triggers a two-pass run -- a warm-up followed
+               by the performance-collection run.
+
+            {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            .. rubric:: Download the Docker image and required scripts
+
+            1. Run the vLLM benchmark tool independently by starting the
+               `Docker container <{{ unified_docker.docker_hub_url }}>`_
+               as shown in the following snippet.
+
+               .. code-block:: shell
+
+                  docker pull {{ unified_docker.pull_tag }}
+                  docker run -it \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --shm-size 16G \
+                      --security-opt seccomp=unconfined \
+                      --security-opt apparmor=unconfined \
+                      --cap-add=SYS_PTRACE \
+                      -v $(pwd):/workspace \
+                      --env HUGGINGFACE_HUB_CACHE=/workspace \
+                      --name test \
+                      {{ unified_docker.pull_tag }}
+
+            2. In the Docker container, clone the ROCm MAD repository and navigate to the
+               benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD/scripts/vllm
+
+            3. To start the benchmark, use the following command with the appropriate options.
+
+               .. dropdown:: Benchmark options
+                  :open:
+
+                  .. list-table::
+                     :header-rows: 1
+                     :align: center
+
+                     * - Name
+                       - Options
+                       - Description
+
+                     * - ``$test_option``
+                       - latency
+                       - Measure decoding token latency
+
+                     * -
+                       - throughput
+                       - Measure token generation throughput
+
+                     * -
+                       - all
+                       - Measure both throughput and latency
+
+                     * - ``$num_gpu``
+                       - 1 or 8
+                       - Number of GPUs
+
+                     * - ``$datatype``
+                       - ``float16`` or ``float8``
+                       - Data type
+
+                  The input sequence length, output sequence length, and tensor parallel (TP) are
+                  already configured. You don't need to specify them with this script.
+
+               Command:
+
+               .. code-block::
+
+                  ./vllm_benchmark_report.sh \
+                      -s $test_option \
+                      -m {{model.model_repo}} \
+                      -g $num_gpu \
+                      -d {{model.precision}}
+
+               .. note::
+
+                  For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
+
+                  If you encounter the following error, pass your access-authorized Hugging
+                  Face token to the gated models.
+
+                  .. code-block::
+
+                     OSError: You are trying to access a gated repo.
+
+                     # pass your HF_TOKEN
+                     export HF_TOKEN=$your_personal_hf_token
+
+            .. rubric:: Benchmarking examples
+
+            Here are some examples of running the benchmark with various options:
+
+            * Latency benchmark
+
+              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block:: shell
+
+                 ./vllm_benchmark_report.sh \
+                     -s latency \
+                     -m {{model.model_repo}} \
+                     -g 8 \
+                     -d {{model.precision}}
+
+              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
+
+            * Throughput benchmark
+
+              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block:: shell
+
+                 ./vllm_benchmark_report.sh \
+                     -s throughput \
+                     -m {{model.model_repo}} \
+                     -g 8 \
+                     -d {{model.precision}}
+
+              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
+
+            .. raw:: html
+
+               <style>
+               mjx-container[jax="CHTML"][display="true"] {
+                  text-align: left;
+                  margin: 0;
+               }
+               </style>
+
+            .. note::
+
+               Throughput is calculated as:
+
+               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+      {% endfor %}
+   {% endfor %}
+
+Advanced usage
+==============
+
+For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
+see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
+
+Reproducing the Docker image
+----------------------------
+
+To reproduce this ROCm/vLLM Docker image release, follow these steps:
+
+1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
+
+   .. code-block:: shell
+
+      git clone https://github.com/ROCm/vllm.git
+
+2. Check out the specific release commit.
+
+   .. code-block:: shell
+
+      cd vllm
+      git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
+
+3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
+
+   .. code-block:: shell
+
+      docker build -f docker/Dockerfile.rocm -t vllm-rocm .
+
+Known issues and workarounds
+============================
+
+AITER does not support FP8 KV cache yet.
+
+Further reading
+===============
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
@@ -7,7 +7,7 @@ vLLM inference performance testing version history
 This table lists previous versions of the ROCm vLLM inference Docker image for
 inference performance testing. For detailed information about available models
 for benchmarking, see the version-specific documentation. You can find tagged
-previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`__.
+previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.docker.com/r/rocm/vllm/tags>`__.

 .. list-table::
   :header-rows: 1
@@ -16,14 +16,23 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
     - Components
     - Resources

-   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
+   * - ``rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812``
       (latest)
+     - 
+       * ROCm 6.4.1
+       * vLLM 0.10.0
+       * PyTorch 2.7.0
+     - 
+       * :doc:`Documentation <../vllm>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa>`__
+
+   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
     - 
       * ROCm 6.4.1
       * vLLM 0.9.1
       * PyTorch 2.7.0
     - 
-       * :doc:`Documentation <../vllm>`
+       * :doc:`Documentation <vllm-0.9.1-20250715>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__

   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
@@ -31,22 +31,26 @@ PyTorch inference performance testing
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-        <div class="row">
-          <div class="col-2 me-2 model-param-head">Model</div>
-          <div class="row col-10">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
      {% for model_group in model_groups %}
-            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
      {% endfor %}
            </div>
         </div>

-        <div class="row mt-1" style="display: none;">
-          <div class="col-2 me-2 model-param-head">Model</div>
-          <div class="row col-10">
+         <div class="row gx-0 pt-1" style="display: none;">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
      {% for model_group in model_groups %}
         {% set models = model_group.models %}
         {% for model in models %}
-            <div class="col-12 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
         {% endfor %}
      {% endfor %}
            </div>
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/sglang.rst
@@ -2,19 +2,19 @@
   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and SGLang
   :keywords: model, MAD, automation, dashboarding, validate

-************************************
-SGLang inference performance testing
-************************************
+*****************************************************************
+SGLang inference performance testing DeepSeek-R1-Distill-Qwen-32B
+*****************************************************************

 .. _sglang-benchmark-unified-docker:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml

-   {% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
+   {% set docker = data.dockers[0] %}

   `SGLang <https://docs.sglang.ai>`__ is a high-performance inference and
   serving engine for large language models (LLMs) and vision models. The
-   ROCm-enabled `SGLang Docker image <{{ unified_docker.docker_hub_url }}>`__
+   ROCm-enabled `SGLang Docker image <{{ docker.docker_hub_url }}>`__
   bundles SGLang with PyTorch, optimized for AMD Instinct MI300X series
   accelerators. It includes the following software components:

@@ -24,14 +24,10 @@ SGLang inference performance testing
      * - Software component
        - Version

-      * - `ROCm <https://github.com/ROCm/ROCm>`__
-        - {{ unified_docker.rocm_version }}
-
-      * - `SGLang <https://docs.sglang.ai/index.html>`__
-        - {{ unified_docker.sglang_version }} 
-
-      * - `PyTorch <https://github.com/pytorch/pytorch>`__
-        - {{ unified_docker.pytorch_version }} 
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}

 System validation
 =================
@@ -50,8 +46,8 @@ system's configuration.

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/sglang-benchmark-models.yaml

-   {% set unified_docker = data.sglang_benchmark.unified_docker.latest %}
-   {% set model_groups = data.sglang_benchmark.model_groups %}
+   {% set unified_docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}

   Pull the Docker image
   =====================
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
@@ -7,14 +7,13 @@
 vLLM inference performance testing
 **********************************

-.. _vllm-benchmark-unified-docker:
+.. _vllm-benchmark-unified-docker-909:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml

-   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
-   {% set model_groups = data.vllm_benchmark.model_groups %}
+   {% set docker = data.dockers[0] %}

-   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   The `ROCm vLLM Docker <{{ docker.docker_hub_url }}>`_ image offers
   a prebuilt, optimized environment for validating large language model (LLM)
   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
@@ -26,20 +25,13 @@ vLLM inference performance testing
      * - Software component
        - Version

-      * - `ROCm <https://github.com/ROCm/ROCm>`__
-        - {{ unified_docker.rocm_version }}
-
-      * - `vLLM <https://docs.vllm.ai/en/latest>`__
-        - {{ unified_docker.vllm_version }}
-
-      * - `PyTorch <https://github.com/ROCm/pytorch>`__
-        - {{ unified_docker.pytorch_version }}
-
-      * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
-        - {{ unified_docker.hipblaslt_version }}
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}

 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements>` for
+inference performance numbers <vllm-benchmark-performance-measurements-909>` for
 MI300X series accelerators.

 What's new
@@ -47,27 +39,23 @@ What's new

 The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.

-* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default.
-  This parameter has been removed from the benchmarking script.
+* Upgraded to vLLM v0.10.1.

-* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
-  This parameter has been removed from the benchmarking script.
+* Set ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1`` by default for better performance.

-* Fixed a ``+rms_norm`` custom kernel issue.
+* Set ``VLLM_ROCM_USE_AITER_RMSNORM=0`` by default to avoid various issues with ``torch.compile``.

-* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``.
-
-* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
+.. _vllm-benchmark-supported-models-909:

 Supported models
 ================

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml

-   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
-   {% set model_groups = data.vllm_benchmark.model_groups %}
+   {% set docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}

-   .. _vllm-benchmark-available-models:
+   .. _vllm-benchmark-available-models-909:

   The following models are supported for inference performance benchmarking
   with vLLM and ROCm. Some instructions, commands, and recommendations in this
@@ -76,25 +64,25 @@ Supported models
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-      <div class="row">
-         <div class="col-2 me-2 model-param-head">Model group</div>
-         <div class="row col-10">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
      {% for model_group in model_groups %}
-            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
      {% endfor %}
            </div>
         </div>

-      <div class="row mt-1">
-         <div class="col-2 me-2 model-param-head">Model</div>
-         <div class="row col-10">
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
      {% for model_group in model_groups %}
         {% set models = model_group.models %}
         {% for model in models %}
            {% if models|length % 3 == 0 %}
-            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% else %}
-            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% endif %}
         {% endfor %}
      {% endfor %}
@@ -102,36 +90,32 @@ Supported models
         </div>
      </div>

-   .. _vllm-benchmark-vllm:
+   .. _vllm-benchmark-vllm-909:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}

-   .. container:: model-doc {{model.mad_tag}}
+   .. container:: model-doc {{ model.mad_tag }}

      .. note::

         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
         Some models require access authorization prior to use via an external license agreement through a third party.
+      {% if model.precision == "float8" and model.model_repo.startswith("amd") %}
+         This model uses FP8 quantization via `AMD Quark <https://quark.docs.amd.com/latest/>`__ for efficient inference on AMD accelerators.
+      {% endif %}

      {% endfor %}
   {% endfor %}

-.. note::
-
-   vLLM is a toolkit and library for LLM inference and serving. AMD implements
-   high-performance custom kernels and modules in vLLM to enhance performance.
-   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
-   more information.
-
-.. _vllm-benchmark-performance-measurements:
+.. _vllm-benchmark-performance-measurements-909:

 Performance measurements
 ========================

 To evaluate performance, the
 `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-page provides reference throughput and latency measurements for inferencing popular AI models.
+page provides reference throughput and serving measurements for inferencing popular AI models.

 .. important::

@@ -157,18 +141,18 @@ system's configuration.

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml

-   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
-   {% set model_groups = data.vllm_benchmark.model_groups %}
+   {% set docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}

   Pull the Docker image
   =====================

-   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Download the `ROCm vLLM Docker image <{{ docker.docker_hub_url }}>`_.
   Use the following command to pull the Docker image from Docker Hub.

   .. code-block:: shell

-      docker pull {{ unified_docker.pull_tag }}
+      docker pull {{ docker.pull_tag }}

   Benchmarking
   ============
@@ -176,7 +160,7 @@ system's configuration.
   Once the setup is complete, choose between two options to reproduce the
   benchmark results:

-   .. _vllm-benchmark-mad:
+   .. _vllm-benchmark-mad-909:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
@@ -187,6 +171,9 @@ system's configuration.

         .. tab-item:: MAD-integrated benchmarking

+            The following run command is tailored to {{ model.model }}.
+            See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.
+
            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
               directory and install the required packages on the host machine.

@@ -209,12 +196,15 @@ system's configuration.
                      --timeout 28800

            MAD launches a Docker container with the name
-            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
-            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
+            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
+            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
+            and ``{{ model.mad_tag }}_serving.csv``.

-            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
-            to collect latency and throughput performance data, you can also change the benchmarking
-            parameters. See the standalone benchmarking tab for more information.
+            Although the :ref:`available models
+            <vllm-benchmark-available-models-909>` are preconfigured to collect
+            offline throughput and online serving performance data, you can
+            also change the benchmarking parameters. See the standalone
+            benchmarking tab for more information.

            {% if model.tunableop %}

@@ -224,28 +214,38 @@ system's configuration.
               TunableOp automatically explores different implementations and configurations of certain PyTorch
               operators to find the fastest one for your hardware.

-               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
-               (see
-               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
-               To enable it, include the ``--tunableop on`` argument in your
-               run.
+               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
+               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
+               the ``--tunableop on`` argument in your run.

-               Enabling TunableOp triggers a two-pass run -- a warm-up followed
-               by the performance-collection run.
+               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
+               performance-collection run.

            {% endif %}

         .. tab-item:: Standalone benchmarking

-            .. rubric:: Download the Docker image and required scripts
+            The following commands are optimized for {{ model.model }}.
+            See :ref:`vllm-benchmark-supported-models-909` to switch to another available model.

-            1. Run the vLLM benchmark tool independently by starting the
-               `Docker container <{{ unified_docker.docker_hub_url }}>`_
-               as shown in the following snippet.
+            .. seealso::
+
+               For more information on configuration, see the `config files
+               <https://github.com/ROCm/MAD/tree/develop/scripts/vllm/configs>`__
+               in the MAD repository. Refer to the `vLLM engine arguments <https://docs.vllm.ai/en/latest/configuration/engine_args.html#engineargs>`__
+               for descriptions of available configuration options
+               and `Benchmarking vLLM <https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md>`__ for
+               additional benchmarking information.
+
+            .. rubric:: Launch the container
+
+            You can run the vLLM benchmark tool independently by starting the
+            `Docker container <{{ docker.docker_hub_url }}>`_ as shown
+            in the following snippet.

            .. code-block:: shell

-                  docker pull {{ unified_docker.pull_tag }}
+               docker pull {{ docker.pull_tag }}
               docker run -it \
                   --device=/dev/kfd \
                   --device=/dev/dri \
@@ -257,66 +257,102 @@ system's configuration.
                   -v $(pwd):/workspace \
                   --env HUGGINGFACE_HUB_CACHE=/workspace \
                   --name test \
-                      {{ unified_docker.pull_tag }}
+                   {{ docker.pull_tag }}

-            2. In the Docker container, clone the ROCm MAD repository and navigate to the
-               benchmark scripts directory at ``~/MAD/scripts/vllm``.
+            .. rubric:: Throughput command
+
+            Use the following command to start the throughput benchmark.

            .. code-block:: shell

-                  git clone https://github.com/ROCm/MAD
-                  cd MAD/scripts/vllm
+               model={{ model.model_repo }}
+               tp={{ model.config.tp }}
+               num_prompts=1024
+               in=128
+               out=128
+               dtype={{ model.config.dtype }}
+               kv_cache_dtype={{ model.config.kv_cache_dtype }}
+               max_num_seqs=1024
+               max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
+               max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
+               max_model_len={{ model.config.max_model_len }}

-            3. To start the benchmark, use the following command with the appropriate options.
+               vllm bench throughput --model $model \
+                   -tp $tp \
+                   --num-prompts $num_prompts \
+                   --input-len $in \
+                   --output-len $out \
+                   --dtype $dtype \
+                   --kv-cache-dtype $kv_cache_dtype \
+                   --max-num-seqs $max_num_seqs \
+                   --max-seq-len-to-capture $max_seq_len_to_capture \
+                   --max-num-batched-tokens $max_num_batched_tokens \
+                   --max-model-len $max_model_len \
+                   --trust-remote-code \
+                   --output-json ${model}_throughput.json \
+                   --gpu-memory-utilization 0.9

-               .. dropdown:: Benchmark options
-                  :open:
+            .. rubric:: Serving command

-                  .. list-table::
-                     :header-rows: 1
-                     :align: center
+            1. Start the server using the following command:

-                     * - Name
-                       - Options
-                       - Description
+               .. code-block:: shell

-                     * - ``$test_option``
-                       - latency
-                       - Measure decoding token latency
+                  model={{ model.model_repo }}
+                  tp={{ model.config.tp }}
+                  dtype={{ model.config.dtype }}
+                  kv_cache_dtype={{ model.config.kv_cache_dtype }}
+                  max_num_seqs=256
+                  max_seq_len_to_capture={{ model.config.max_seq_len_to_capture }}
+                  max_num_batched_tokens={{ model.config.max_num_batched_tokens }}
+                  max_model_len={{ model.config.max_model_len }}

-                     * -
-                       - throughput
-                       - Measure token generation throughput
+                  vllm serve $model \
+                      -tp $tp \
+                      --dtype $dtype \
+                      --kv-cache-dtype $kv_cache_dtype \
+                      --max-num-seqs $max_num_seqs \
+                      --max-seq-len-to-capture $max_seq_len_to_capture \
+                      --max-num-batched-tokens $max_num_batched_tokens \
+                      --max-model-len $max_model_len \
+                      --no-enable-prefix-caching \
+                      --swap-space 16 \
+                      --disable-log-requests \
+                      --trust-remote-code \
+                      --gpu-memory-utilization 0.9

-                     * -
-                       - all
-                       - Measure both throughput and latency
+               Wait until the model has loaded and the server is ready to accept requests.

-                     * - ``$num_gpu``
-                       - 1 or 8
-                       - Number of GPUs
+            2. In another terminal on the same machine, run the benchmark:

-                     * - ``$datatype``
-                       - ``float16`` or ``float8``
-                       - Data type
+               .. code-block:: shell

-                  The input sequence length, output sequence length, and tensor parallel (TP) are
-                  already configured. You don't need to specify them with this script.
+                  # Connect to the container
+                  docker exec -it test bash

-               Command:
+                  # Wait for the server to start
+                  until curl -s http://localhost:8000/v1/models; do sleep 30; done

-               .. code-block::
-
-                  ./vllm_benchmark_report.sh \
-                      -s $test_option \
-                      -m {{model.model_repo}} \
-                      -g $num_gpu \
-                      -d {{model.precision}}
+                  # Run the benchmark
+                  model={{ model.model_repo }}
+                  max_concurrency=1
+                  num_prompts=10
+                  in=128
+                  out=128
+                  vllm bench serve --model $model \
+                      --percentile-metrics "ttft,tpot,itl,e2el" \
+                      --dataset-name random \
+                      --ignore-eos \
+                      --max-concurrency $max_concurrency \
+                      --num-prompts $num_prompts \
+                      --random-input-len $in \
+                      --random-output-len $out \
+                      --trust-remote-code \
+                      --save-result \
+                      --result-filename ${model}_serving.json

            .. note::

-                  For best performance, it's recommend to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
-
               If you encounter the following error, pass your access-authorized Hugging
               Face token to the gated models.

@@ -327,38 +363,6 @@ system's configuration.
                  # pass your HF_TOKEN
                  export HF_TOKEN=$your_personal_hf_token

-            .. rubric:: Benchmarking examples
-
-            Here are some examples of running the benchmark with various options:
-
-            * Latency benchmark
-
-              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
-
-              .. code-block::
-
-                 ./vllm_benchmark_report.sh \
-                     -s latency \
-                     -m {{model.model_repo}} \
-                     -g 8 \
-                     -d {{model.precision}}
-
-              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
-
-            * Throughput benchmark
-
-              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
-
-              .. code-block:: shell
-
-                 ./vllm_benchmark_report.sh \
-                     -s throughput \
-                     -m {{model.model_repo}} \
-                     -g 8 \
-                     -d {{model.precision}}
-
-              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
-
            .. raw:: html

               <style>
@@ -382,7 +386,7 @@ Advanced usage
 ==============

 For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
-see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
+see the developer's guide at `<https://github.com/ROCm/vllm/blob/documentation/docs/dev-docker/README.md>`__.

 Reproducing the Docker image
 ----------------------------
@@ -400,7 +404,7 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
   .. code-block:: shell

      cd vllm
-      git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
+      git checkout 6663000a391911eba96d7864a26ac42b07f6ef29

 3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.

@@ -408,11 +412,6 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:

      docker build -f docker/Dockerfile.rocm -t vllm-rocm .

-Known issues and workarounds
-============================
-
-AITER does not support FP8 KV cache yet.
-
 Further reading
 ===============

@@ -424,15 +423,12 @@ Further reading
 - To learn more about system settings and management practices to configure your system for
  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

+- See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+  a brief introduction to vLLM and optimization strategies.
+
 - For application performance optimization strategies for HPC and AI workloads,
  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.

- To learn how to run community models from Hugging Face on AMD GPUs, see
-  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
-
- To learn how to fine-tune LLMs and optimize inference, see
-  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
-
 - For a list of other ready-made Docker images for AI with ROCm, see
  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

--- a/docs/how-to/rocm-for-ai/install.rst
+++ b/docs/how-to/rocm-for-ai/install.rst
@@ -1,14 +1,14 @@
 .. meta::
-   :description: How to install ROCm and popular machine learning frameworks.
+   :description: How to install ROCm and popular deep learning frameworks.
   :keywords: ROCm, AI, LLM, train, fine-tune, FSDP, DeepSpeed, LLaMA, tutorial

 .. _rocm-for-ai-install:

-***********************************************
-Installing ROCm and machine learning frameworks
-***********************************************
+********************************************
+Installing ROCm and deep learning frameworks
+********************************************

-Before getting started, install ROCm and supported machine learning frameworks.
+Before getting started, install ROCm and supported deep learning frameworks.

 .. grid:: 1

@@ -22,7 +22,7 @@ If you’re new to ROCm, refer to the :doc:`ROCm quick start install guide for L
 <rocm-install-on-linux:install/quick-start>`.

 If you’re using a Radeon GPU for graphics-accelerated applications, refer to the
-`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/docs-6.1.3/docs/install/native_linux/install-radeon.html>`_.
+`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/howto_native_linux.html>`_.

 You can install ROCm on :doc:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
 distribution's package manager. See the following documentation resources to get started:
@@ -43,29 +43,16 @@ distribution's package manager. See the following documentation resources to get
      If you encounter any issues during installation, refer to the
      :doc:`Installation troubleshooting <rocm-install-on-linux:reference/install-faq>` guide.

-Machine learning frameworks
-===========================
+Deep learning frameworks
+========================

-ROCm supports popular machine learning frameworks and libraries including `PyTorch
+ROCm supports deep learning frameworks and libraries including `PyTorch
 <https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package>`_, `TensorFlow
-<https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and `DeepSpeed
-<https://cloudblogs.microsoft.com/opensource/2022/03/21/supporting-efficient-large-model-training-on-amd-instinct-gpus-with-deepspeed/>`_.
+<https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and more.

-Review the framework installation documentation. For ease-of-use, it's recommended to use official ROCm prebuilt Docker
+Review the :doc:`framework installation documentation <../deep-learning-rocm>`. For ease-of-use, it's recommended to use official ROCm prebuilt Docker
 images with the framework pre-installed.

-* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
-
-* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
-
-* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
-
-* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
-
-* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
-
-* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
-
 Next steps
 ==========

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
@@ -2,9 +2,9 @@
   :description: How to train a model using JAX MaxText for ROCm.
   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker

-**************************************
-Training a model with MaxText for ROCm
-**************************************
+******************************************
+Training a model with JAX MaxText for ROCm
+******************************************

 MaxText is a high-performance, open-source framework built on the Google JAX
 machine learning library to train LLMs at scale. The MaxText framework for
@@ -12,70 +12,108 @@ ROCm is an optimized fork of the upstream
 `<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
 on AMD MI300X series accelerators.

-The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.5``) image
+The MaxText for ROCm training Docker image
 provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
 including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
 It includes the following software components:

-+--------------------------+--------------------------------+
-| Software component       | Version                        |
-+==========================+================================+
-| ROCm                     | 6.3.4                          |
-+--------------------------+--------------------------------+
-| JAX                      | 0.4.35                         |
-+--------------------------+--------------------------------+
-| Python                   | 3.10.12                        |
-+--------------------------+--------------------------------+
-| Transformer Engine       | 1.12.0.dev0+b8b92dc            |
-+--------------------------+--------------------------------+
-| hipBLASLt                | 0.13.0-ae9c477a                |
-+--------------------------+--------------------------------+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml

-Supported features and models
-=============================
+   {% set dockers = data.dockers %}
+   .. tab-set::

-MaxText provides the following key features to train large language models efficiently:
+      {% for docker in dockers %}
+      {% set jax_version = docker.components["JAX"] %}
+
+      .. tab-item:: JAX {{ jax_version }}
+         :sync: {{ docker.pull_tag }}
+
+         .. list-table::
+            :header-rows: 1
+
+            * - Software component
+              - Version
+
+            {% for component_name, component_version in docker.components.items() %}
+            * - {{ component_name }}
+              - {{ component_version }}
+
+            {% endfor %}
+         {% if jax_version == "0.6.0" %}
+         .. note::
+
+            Shardy is a new config in JAX 0.6.0. You might get related errors if it's
+            not configured correctly. For now you can turn it off by setting
+            ``shardy=False`` during the training run. You can also follow the `migration
+            guide <https://docs.jax.dev/en/latest/shardy_jax_migration.html>`__ to enable
+            it.
+
+            The provided multi-node training scripts in this documentation are
+            not currently supported with JAX 0.6.0. For multi-node training, use the JAX 0.5.0
+            Docker image.
+         {% endif %}
+
+      {% endfor %}
+
+MaxText on ROCm provides the following key features to train large language models efficiently:

 - Transformer Engine (TE)

- Flash Attention (FA) 3
+- Flash Attention (FA) 3 -- with or without sequence input packing

 - GEMM tuning

 - Multi-node support

-.. _amd-maxtext-model-support:
+- NANOO FP8 quantization support

-The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
+.. _amd-maxtext-model-support-v257:

-* Llama 3.3 70B
+Supported models
+================

-* Llama 3.1 8B
+The following models are pre-optimized for performance on AMD Instinct MI300
+series accelerators. Some instructions, commands, and available training
+configurations in this documentation might vary by model -- select one to get
+started.

-* Llama 3.1 70B
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml

-* Llama 3 8B
+   {% set model_groups = data.model_groups %}
+   .. raw:: html

-* Llama 3 70B
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-4 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+            </div>
+         </div>

-* Llama 2 7B
-
-* Llama 2 70B
-
-* DeepSeek-V2-Lite
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
+      </div>

 .. note::

   Some models, such as Llama 3, require an external license agreement through
   a third party (for example, Meta).

-Unsupported features
--------------------
-
-Currently, MaxText's default packed input format is not supported. Using this format
-with the current Docker image results in incorrect attention calculations
-across different input sequences. Support for packed input format is planned for a future release.
-
 System validation
 =================

@@ -98,14 +136,14 @@ This Docker image is optimized for specific model configurations outlined
 as follows. Performance can vary for other training workloads, as AMD
 doesn’t validate configurations and run conditions outside those described.

-.. _amd-maxtext-multi-node-setup:
+.. _amd-maxtext-multi-node-setup-v257:

 Multi-node setup
 ----------------

 For multi-node environments, ensure you have all the necessary packages for
 your network device, such as, RDMA. If you're not using a multi-node setup
-with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
+with RDMA, skip ahead to :ref:`amd-maxtext-get-started-v257`.

 1. Install the following packages to build and install the RDMA driver.

@@ -170,7 +208,7 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.

   e. RDMA interface

-      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup>` are installed on all nodes.
+      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v257>` are installed on all nodes.
      Then, set the RDMA interfaces to use for communication.

      .. code-block:: bash
@@ -180,196 +218,203 @@ with RDMA, skip ahead to :ref:`amd-maxtext-download-docker`.
         # If using Mellanox NIC
         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9

-.. _amd-maxtext-download-docker:
+.. _amd-maxtext-get-started-v257:

-Pull the Docker image
---------------------
+Benchmarking
+============

-1. Use the following command to pull the Docker image from Docker Hub.
+Once the setup is complete, choose between two options to reproduce the
+benchmark results:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/jax-maxtext-benchmark-models.yaml
+
+   .. _vllm-benchmark-mad:
+
+   {% set dockers = data.dockers %}
+   {% set model_groups = data.model_groups %}
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         {% if model.mad_tag and "single-node" in model.doc_options %}
+         .. tab-item:: MAD-integrated benchmarking
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.

               .. code-block:: shell

-      docker pull rocm/jax-training:maxtext-v25.5
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt

-2. Use the following command to launch the Docker container. Note that the benchmarking scripts
-   used in the :ref:`following section <amd-maxtext-get-started>` automatically launch the Docker container
-   and execute the benchmark.
+            2. Use this command to run the performance benchmark test on the {{ model.model }} model
+               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.

               .. code-block:: shell

-      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800

-.. _amd-maxtext-get-started:
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/perf.csv``.
+         {% endif %}

-Getting started
-===============
+         .. tab-item:: Standalone benchmarking

-The following examples demonstrate how to get started with single node
-and multi-node training using the benchmarking scripts provided at
-`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
+            .. rubric:: Download the Docker image and required scripts

-.. important::
+            Run the JAX MaxText benchmark tool independently by starting the
+            Docker container as shown in the following snippet.

-   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
+            .. tab-set::
+               {% for docker in dockers %}
+               {% set jax_version = docker.components["JAX"] %}

-Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
-set correctly and points to your Hugging Face cache directory. Refer to the
-README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
-for more detailed instructions.
-
-Single node training benchmarking examples
------------------------------------------
-
-* Example 1: Single node training with Llama 2 7B
-
-  Download the benchmarking script:
+               .. tab-item:: JAX {{ jax_version }}
+                  :sync: {{ docker.pull_tag }}

                  .. code-block:: shell

-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
+                     docker pull {{ docker.pull_tag }}
+               {% endfor %}

-  Run the single node training benchmark:
+            {% if model.model_repo and "single-node" in model.doc_options %}
+            .. rubric:: Single node training
+
+            1. Set up environment variables.

               .. code-block:: shell

-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh
+                  export MAD_SECRETS_HFTOKEN=<Your Hugging Face token>
+                  export HF_HOME=<Location of saved/cached Hugging Face models>

-* Example 2: Single node training with Llama 2 70B
+               ``MAD_SECRETS_HFTOKEN`` is your Hugging Face access token to access models, tokenizers, and data.
+               See `User access tokens <https://huggingface.co/docs/hub/en/security-tokens>`__.

-  Download the benchmarking script:
+               ``HF_HOME`` is where ``huggingface_hub`` will store local data. See `huggingface_hub CLI <https://huggingface.co/docs/huggingface_hub/main/en/guides/cli#huggingface-cli-download>`__.
+               If you already have downloaded or cached Hugging Face artifacts, set this variable to that path.
+               Downloaded files typically get cached to ``~/.cache/huggingface``.
+
+            2. Launch the Docker container.
+
+               .. tab-set::
+                  {% for docker in dockers %}
+                  {% set jax_version = docker.components["JAX"] %}
+
+                  .. tab-item:: JAX {{ jax_version }}
+                     :sync: {{ docker.pull_tag }}

                     .. code-block:: shell

-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
+                        docker run -it \
+                            --device=/dev/dri \
+                            --device=/dev/kfd \
+                            --network host \
+                            --ipc host \
+                            --group-add video \
+                            --cap-add=SYS_PTRACE \
+                            --security-opt seccomp=unconfined \
+                            --privileged \
+                            -v $HOME:$HOME \
+                            -v $HOME/.ssh:/root/.ssh \
+                            -v $HF_HOME:/hf_cache \
+                            -e HF_HOME=/hf_cache \
+                            -e MAD_SECRETS_HFTOKEN=$MAD_SECRETS_HFTOKEN \
+                            --shm-size 64G \
+                            --name training_env \
+                            {{ docker.pull_tag }}
+                  {% endfor %}

-  Run the single node training benchmark:
+            3. In the Docker container, clone the ROCm MAD repository and navigate to the
+               benchmark scripts directory at ``MAD/scripts/jax-maxtext``.

               .. code-block:: shell

-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD/scripts/jax-maxtext

-* Example 3: Single node training with Llama 3 8B
-
-  Download the benchmarking script:
+            4. Run the setup scripts to install libraries and datasets needed
+               for benchmarking.

               .. code-block:: shell

-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
+                  ./jax-maxtext_benchmark_setup.sh -m {{ model.model_repo }}

-  Run the single node training benchmark:
+            5. To run the training benchmark without quantization, use the following command:

               .. code-block:: shell

-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh
+                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }}

-* Example 4: Single node training with Llama 3 70B
-
-  Download the benchmarking script:
+               For quantized training, use the following command:

               .. code-block:: shell

-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
+                  ./jax-maxtext_benchmark_report.sh -m {{ model.model_repo }} -q nanoo_fp8

-  Run the single node training benchmark:
+               .. important::

-  .. code-block:: shell
+                  Quantized training is not supported with the JAX 0.6.0 Docker image; support
+                  will be added in a future release. For quantized training, use the JAX 0.5.0
+                  Docker image: ``rocm/jax-training:maxtext-v25.7``.

-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh
+            {% endif %}
+            {% if model.multinode_training_script and "multi-node" in model.doc_options %}
+            .. rubric:: Multi-node training

-* Example 5: Single node training with Llama 3.3 70B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh
-
-* Example 6: Single node training with DeepSeek V2 16B
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
-
-  Run the single node training benchmark:
-
-  .. code-block:: shell
-
-     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh
+            The following examples use SLURM to run on multiple nodes.

            .. note::

-     The reported TFLOP/s by MaxText for DeepSeek is not accurate. Use
-     the tokens/s as a performance indicator.
+               The following scripts will launch the Docker container and run the
+               benchmark. Run them outside of any Docker container.

-Multi-node training benchmarking examples
-----------------------------------------
+            1. Make sure ``$HF_HOME`` is set before running the test. See
+               `ROCm benchmarking <https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__
+               for more details on downloading the Llama models before running the
+               benchmark.

-The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
-own cluster setup.
+            2. To run multi-node training for {{ model.model }},
+               use the
+               `multi-node training script <https://github.com/ROCm/MAD/blob/develop/scripts/jax-maxtext/gpu-rocm/{{ model.multinode_training_script }}>`__
+               under the ``scripts/jax-maxtext/gpu-rocm/`` directory.

-* Example 1: Multi-node training with Llama 2 7B
-
-  Download the benchmarking script:
+            3. Run the multi-node training benchmark script.

               .. code-block:: shell

-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
+                  sbatch -N <num_nodes> {{ model.multinode_training_script }}

-  Run the multi-node training benchmark. For example:
+         {% else %}
+            .. rubric:: Multi-node training

-  .. code-block:: shell
+            For multi-node training examples, choose a model from :ref:`amd-maxtext-model-support-v257`
+            with an available `multi-node training script <https://github.com/ROCm/MAD/tree/develop/scripts/jax-maxtext/gpu-rocm>`__.
+         {% endif %}
+      {% endfor %}
+   {% endfor %}

-     sbatch -N <num_nodes> llama2_7b_multinode.sh
+Further reading
+===============

-* Example 2: Multi-node training with Llama 2 70B
+- See the ROCm/maxtext benchmarking README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/readme.md>`__.

-  Download the benchmarking script:
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.

-  .. code-block:: shell
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.

-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama2_70b_multinode.sh
-
-* Example 3: Multi-node training with Llama 3 8B model
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama3_8b_multinode.sh
-
-* Example 4: Multi-node training with Llama 3 70B model
-
-  Download the benchmarking script:
-
-  .. code-block:: shell
-
-     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
-
-  Run the multi-node training benchmark. For example:
-
-  .. code-block:: shell
-
-     sbatch -N <num_nodes> llama3_70b_multinode.sh
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.

 Previous versions
 =================
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
@@ -1,3 +1,5 @@
+:orphan:
+
 .. meta::
   :description: How to train a model using Megatron-LM for ROCm.
   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
@@ -6,6 +8,14 @@
 Training a model with Megatron-LM for ROCm
 ******************************************

+.. caution::
+
+   The ROCm Megatron-LM framework now has limited support with this Docker
+   environment; it now focuses on Primus with Megatron-Core. See :doc:`primus-megatron`.
+
+   To learn how to migrate your existing workloads to Primus with Megatron-Core,
+   see :doc:`previous-versions/megatron-lm-primus-migration-guide`.
+
 The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
 a specialized fork of the robust Megatron-LM, designed to enable efficient
 training of large-scale language models on AMD GPUs. By leveraging AMD
@@ -20,13 +30,17 @@ essential components, including PyTorch, ROCm libraries, and Megatron-LM
 utilities. It contains the following software components to accelerate training
 workloads:

+.. note::
+
+   This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
+   Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.
+
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml

   {% set dockers = data.dockers %}
-   {% if dockers|length > 1 %}
   .. tab-set::

-      {% for docker in data.dockers %}
+      {% for docker in dockers %}
      .. tab-item:: ``{{ docker.pull_tag }}``
         :sync: {{ docker.pull_tag }}

@@ -42,28 +56,14 @@ workloads:

            {% endfor %}
      {% endfor %}
-   {% elif dockers|length == 1 %}
-   .. list-table::
-      :header-rows: 1
-
-      * - Software component
-        - Version
-
-      {% for component_name, component_version in docker.components %}
-      * - {{ component_name }}
-        - {{ component_version }}
-
-      {% endfor %}
-   {% endif %}

   .. _amd-megatron-lm-model-support:

-   The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
-
   Supported models
   ================

-   The following models are supported for training performance benchmarking with Megatron-LM and ROCm.
+   The following models are supported for training performance benchmarking with Megatron-LM and ROCm
+   on AMD Instinct MI300X series accelerators.
   Some instructions, commands, and training recommendations in this documentation might
   vary by model -- select one to get started.

@@ -71,25 +71,25 @@ workloads:
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-           <div class="row">
-             <div class="col-2 me-2 model-param-head">Model</div>
-             <div class="row col-10">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
      {% for model_group in model_groups %}
-               <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
      {% endfor %}
            </div>
         </div>

-           <div class="row mt-1">
-             <div class="col-2 me-2 model-param-head">Model variant</div>
-             <div class="row col-10">
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
      {% for model_group in model_groups %}
         {% set models = model_group.models %}
         {% for model in models %}
            {% if models|length % 3 == 0 %}
-               <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% else %}
-               <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% endif %}
         {% endfor %}
      {% endfor %}
@@ -177,7 +177,7 @@ Download the Docker image
      {% if dockers|length > 1 %}
      .. tab-set::

-         {% for docker in data.dockers %}
+         {% for docker in dockers %}
         .. tab-item:: {{ docker.doc_name }}
            :sync: {{ docker.pull_tag }}

@@ -227,10 +227,17 @@ Download the Docker image
      docker start megatron_training_env
      docker exec -it megatron_training_env bash

-The Docker container includes a pre-installed, verified version of the ROCm
-Megatron-LM development branch
-`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__, including necessary
-training scripts.
+4. **Megatron-LM backward compatibility setup** -- this Docker image is primarily intended for use with Primus, but it maintains Megatron-LM compatibility with limited support.
+   To roll back to using Megatron-LM, follow these steps:
+
+   .. code-block:: shell
+
+      cd /workspace/Megatron-LM/
+      pip uninstall megatron-core
+      pip install -e .
+
+The Docker container hosts
+`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__ at verified commit ``e8e9edc``.

 .. _amd-megatron-lm-environment-setup:

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history.rst
@@ -17,12 +17,21 @@ previous releases of the ``ROCm/jax-training`` Docker image on `Docker Hub <http
     - Components
     - Resources

-   * - 25.5 (latest)
+   * - 25.7 (latest)
+     - 
+       * ROCm 6.4.1
+       * JAX 0.6.0, 0.5.0
+     - 
+       * :doc:`Documentation <../jax-maxtext>`
+       * `Docker Hub (JAX 0.6.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7-jax060/images/sha256-7352212ae033a76dca2b9dceffc23c1b5f1a61a7a560082cf747a9bf1acfc9ce>`__
+       * `Docker Hub (JAX 0.5.0) <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.7/images/sha256-45f4c727d4019a63fc47313d3a5f5a5105569539294ddfd2d742218212ae9025>`__
+
+   * - 25.5
     - 
       * ROCm 6.3.4
       * JAX 0.4.35
     - 
-       * :doc:`Documentation <../jax-maxtext>`
+       * :doc:`Documentation <jax-maxtext-v25.5>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/jax-training/maxtext-v25.5/images/sha256-4e0516358a227cae8f552fb866ec07e2edcf244756f02e7b40212abfbab5217b>`__

   * - 25.4
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4.rst
@@ -51,7 +51,7 @@ MaxText provides the following key features to train large language models effic

 - Multi-node support

-.. _amd-maxtext-model-support:
+.. _amd-maxtext-model-support-v254:

 The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.5.rst
@@ -0,0 +1,385 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using JAX MaxText for ROCm.
+   :keywords: ROCm, AI, LLM, train, jax, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with MaxText for ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm JAX MaxText
+   training performance documentation. See :doc:`../jax-maxtext` for the latest version.
+
+MaxText is a high-performance, open-source framework built on the Google JAX
+machine learning library to train LLMs at scale. The MaxText framework for
+ROCm is an optimized fork of the upstream
+`<https://github.com/AI-Hypercomputer/maxtext>`__ enabling efficient AI workloads
+on AMD MI300X series accelerators.
+
+The MaxText for ROCm training Docker (``rocm/jax-training:maxtext-v25.5``) image
+provides a prebuilt environment for training on AMD Instinct MI300X and MI325X accelerators,
+including essential components like JAX, XLA, ROCm libraries, and MaxText utilities.
+It includes the following software components:
+
+--------------------------+--------------------------------+
+| Software component       | Version                        |
+==========================+================================+
+| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
+| JAX                      | 0.4.35                         |
+--------------------------+--------------------------------+
+| Python                   | 3.10.12                        |
+--------------------------+--------------------------------+
+| Transformer Engine       | 1.12.0.dev0+b8b92dc            |
+--------------------------+--------------------------------+
+| hipBLASLt                | 0.13.0-ae9c477a                |
+--------------------------+--------------------------------+
+
+Supported features and models
+=============================
+
+MaxText provides the following key features to train large language models efficiently:
+
+- Transformer Engine (TE)
+
+- Flash Attention (FA) 3
+
+- GEMM tuning
+
+- Multi-node support
+
+.. _amd-maxtext-model-support-v255:
+
+The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
+
+* Llama 3.3 70B
+
+* Llama 3.1 8B
+
+* Llama 3.1 70B
+
+* Llama 3 8B
+
+* Llama 3 70B
+
+* Llama 2 7B
+
+* Llama 2 70B
+
+* DeepSeek-V2-Lite
+
+.. note::
+
+   Some models, such as Llama 3, require an external license agreement through
+   a third party (for example, Meta).
+
+Unsupported features
+--------------------
+
+Currently, MaxText's default packed input format is not supported. Using this format
+with the current Docker image results in incorrect attention calculations
+across different input sequences. Support for packed input format is planned for a future release.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+Environment setup
+=================
+
+This Docker image is optimized for specific model configurations outlined
+as follows. Performance can vary for other training workloads, as AMD
+doesn’t validate configurations and run conditions outside those described.
+
+.. _amd-maxtext-multi-node-setup-v255:
+
+Multi-node setup
+----------------
+
+For multi-node environments, ensure you have all the necessary packages for
+your network device, such as RDMA. If you're not using a multi-node setup
+with RDMA, skip ahead to :ref:`amd-maxtext-download-docker-v255`.
+
+1. Install the following packages to build and install the RDMA driver.
+
+   .. code-block:: shell
+
+      sudo apt install iproute2 -y
+      sudo apt install -y linux-headers-"$(uname -r)" libelf-dev
+      sudo apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev
+
+   Refer to your NIC manufacturer's documentation for further steps on
+   compiling and installing the RoCE driver. For example, for Broadcom,
+   see `Compiling Broadcom NIC software from source <https://docs.broadcom.com/doc/957608-AN2XX#G3.484341>`_
+   in `Ethernet networking guide for AMD Instinct MI300X GPU clusters <https://docs.broadcom.com/doc/957608-AN2XX>`_.
+
+2. Set the following environment variables.
+
+   a. Master address
+
+      Change ``localhost`` to the master node's resolvable hostname or IP address:
+
+      .. code-block:: bash
+
+         export MASTER_ADDR="${MASTER_ADDR:-localhost}"
+
+   b. Number of nodes
+
+      Set the number of nodes you want to train on (for example, ``2``, ``4``, or ``8``):
+
+      .. code-block:: bash
+
+         export NNODES="${NNODES:-1}"
+
+   c. Node ranks
+
+      Set the rank of each node (``0`` for master, ``1`` for the first worker node, and so on).
+      Node ranks should be unique across all nodes in the cluster.
+
+      .. code-block:: bash
+
+         export NODE_RANK="${NODE_RANK:-0}"
+
+   d. Network interface
+
+      Update the network interface in the script to match your system's network interface. To
+      find your network interface, run the following (outside of any Docker container):
+
+      .. code-block:: bash
+
+         ip a
+
+      Look for an active interface with an IP address in the same subnet as
+      your other nodes. Then, update the following variable in the script, for
+      example:
+
+      .. code-block:: bash
+
+         export NCCL_SOCKET_IFNAME=ens50f0np0
+
+      This variable specifies which network interface to use for inter-node communication.
+      Setting this variable to the incorrect interface can result in communication failures
+      or significantly reduced performance.
+
+   e. RDMA interface
+
+      Ensure the :ref:`required packages <amd-maxtext-multi-node-setup-v255>` are installed on all nodes.
+      Then, set the RDMA interfaces to use for communication.
+
+      .. code-block:: bash
+
+         # If using Broadcom NIC
+         export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+         # If using Mellanox NIC
+         export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
+
+.. _amd-maxtext-download-docker-v255:
+
+Pull the Docker image
+---------------------
+
+1. Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull rocm/jax-training:maxtext-v25.5
+
+2. Use the following command to launch the Docker container. Note that the benchmarking scripts
+   used in the :ref:`following section <amd-maxtext-get-started-v255>` automatically launch the Docker container
+   and execute the benchmark.
+
+   .. code-block:: shell
+
+      docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME/.ssh:/root/.ssh --shm-size 128G --name maxtext_training rocm/jax-training:maxtext-v25.5
+
+.. _amd-maxtext-get-started-v255:
+
+Getting started
+===============
+
+The following examples demonstrate how to get started with single node
+and multi-node training using the benchmarking scripts provided at
+`<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__.
+
+.. important::
+
+   The provided scripts launch a Docker container and execute a benchmark. Ensure you run these commands outside of any existing Docker container.
+
+Before running any benchmarks, ensure the ``$HF_HOME`` environment variable is
+set correctly and points to your Hugging Face cache directory. Refer to the
+README at `<https://github.com/ROCm/maxtext/blob/main/benchmarks/gpu-rocm/>`__
+for more detailed instructions.
+
+Single node training benchmarking examples
+------------------------------------------
+
+* Example 1: Single node training with Llama 2 7B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_7b.sh
+
+* Example 2: Single node training with Llama 2 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama2_70b.sh
+
+* Example 3: Single node training with Llama 3 8B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_8b.sh
+
+* Example 4: Single node training with Llama 3 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3_70b.sh
+
+* Example 5: Single node training with Llama 3.3 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3.3_70b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./llama3.3_70b.sh
+
+* Example 6: Single node training with DeepSeek V2 16B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/deepseek_v2_16b.sh
+
+  Run the single node training benchmark:
+
+  .. code-block:: shell
+
+     IMAGE="rocm/jax-training:maxtext-v25.5" bash ./deepseek_v2_16b.sh
+
+  .. note::
+
+     The reported TFLOP/s by MaxText for DeepSeek is not accurate. Use
+     the tokens/s as a performance indicator.
+
+Multi-node training benchmarking examples
+-----------------------------------------
+
+The following examples use SLURM for running on multiple nodes -- the commands might need to be adjusted for your
+own cluster setup.
+
+* Example 1: Multi-node training with Llama 2 7B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_7b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama2_7b_multinode.sh
+
+* Example 2: Multi-node training with Llama 2 70B
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama2_70b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama2_70b_multinode.sh
+
+* Example 3: Multi-node training with Llama 3 8B model
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_8b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama3_8b_multinode.sh
+
+* Example 4: Multi-node training with Llama 3 70B model
+
+  Download the benchmarking script:
+
+  .. code-block:: shell
+
+     wget https://raw.githubusercontent.com/ROCm/maxtext/refs/heads/main/benchmarks/gpu-rocm/llama3_70b_multinode.sh
+
+  Run the multi-node training benchmark. For example:
+
+  .. code-block:: shell
+
+     sbatch -N <num_nodes> llama3_70b_multinode.sh
+
+Previous versions
+=================
+
+See :doc:`jax-maxtext-history` to find documentation for previous releases
+of the ``ROCm/jax-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
     - Components
     - Resources

-   * - v25.6 (latest)
+   * - v25.7 (latest)
+     - 
+       * ROCm 
+       * PyTorch 
+     - 
+       * :doc:`Documentation <../megatron-lm>`
+       * `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a>`__
+
+   * - v25.6
     - 
       * ROCm 6.4.1
       * PyTorch 2.8.0a0+git7d205b2
     - 
-       * :doc:`Documentation <../megatron-lm>`
+       * :doc:`Documentation <megatron-lm-v25.6>`
       * `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
       * `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst
@@ -0,0 +1,175 @@
+:orphan:
+
+**********************************************************************
+Migrating workloads to Primus (Megatron-Core backend) from Megatron-LM
+**********************************************************************
+
+Primus supports Megatron-Core as a backend optimization library,
+replacing ROCm Megatron-LM. This document outlines the steps to migrate
+workloads from ROCm Megatron-LM to Primus with the Megatron-Core backend.
+
+Model architecture
+==================
+
+ROCm Megatron-LM defines model architecture parameters in the training scripts;
+for example, the Llama 3 8B model parameters are defined in
+`examples/llama/train_llama3.sh <https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama3.sh#L117>`__
+as shown below:
+
+.. code-block:: bash
+
+   HIDDEN_SIZE=4096 
+   FFN_HIDDEN_SIZE=14336 
+   NUM_LAYERS=32 
+   NUM_HEADS=32 
+   NUM_KV_HEADS=8
+
+Primus defines the model architecture through model YAML configuration files
+inside the ``primus/configs/models/megatron/`` directory. For example, Llama 3 8B
+model architecture parameters are defined in
+`primus/configs/models/megatron/llama3_8B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/llama3_8B.yaml>`__
+as shown below:
+
+.. code-block:: yaml
+
+   bases:
+     - llama3_base.yaml
+
+   tokenizer_type: Llama3Tokenizer
+   tokenizer_model: meta-llama/Llama-3.1-8B
+
+   ffn_hidden_size: 14336
+   hidden_size: 4096
+   num_attention_heads: 32
+   num_layers: 32
+   num_query_groups: 8
+
+Primus' model config files follow a hierarchical design, meaning that new model
+config YAMLs can inherit existing model config files by importing them as
+bases. For example,
+`llama3.1_8B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
+uses ``llama3_8B.yaml`` as a base config and overrides a few parameters, as shown below.
+In this example, ``llama3.1_8B`` overrides the ``max_position_embeddings`` value:
+
+.. code-block:: yaml
+
+   bases:
+     - llama3_8B.yaml
+
+   tokenizer_type: Llama3Tokenizer
+   tokenizer_model: meta-llama/Llama-3.1-8B
+
+   max_position_embeddings: 131072
+
+.. tip::
+
+   Primus provides ``llama_base.yaml`` as the base configuration, which can be
+   used as a base for additional model architectures. For example,
+   `mixtral_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/mixtral_base.yaml>`__
+   and
+   `deepseek_v3_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/deepseek_v3_base.yaml>`__
+   define ``llama_base.yaml`` as their base.
+
+   .. code-block:: yaml
+
+      # Example mixtral_base.yaml:
+
+      bases:
+        - llama_base.yaml
+
+      init_method_std: 0.01
+      rotary_base: 1000000
+      qk_layernorm: false
+
+      group_query_attention: true
+      num_query_groups: 8
+
+      # moe parameters
+      num_experts: 8
+      moe_router_topk: 2
+      moe_router_load_balancing_type: aux_loss
+      moe_aux_loss_coeff: 1e-2
+      moe_grouped_gemm: true
+      moe_token_dispatcher_type: alltoall
+
+To add a new category of models, it is recommended to create a new
+``${MODEL_NAME}_base.yaml`` and define new models on top of it. For example,
+to add Qwen2.5 models in Primus, we define
+`qwen2.5_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_base.yaml>`__
+and build
+`qwen2.5_7B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_7B.yaml>`__
+and
+`qwen2.5_72B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_72B.yaml>`__
+using ``qwen2.5_base.yaml`` as the base config.
+
+Training parameters
+===================
+
+ROCm Megatron-LM also defines the training parameters, like batch size,
+tensor-parallelism, precision, and so on, in the training scripts. For example,
+Llama 3 8B training parameters are defined in
+`examples/llama/train_llama3.sh <https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama3.sh>`__
+as shown below:
+
+.. code-block:: bash
+
+   TP="${TP:-8}"
+   PP="${PP:-1}"
+   CP="${CP:-1}"
+   MBS="${MBS:-1}"
+   BS="${BS:-8}"
+
+Primus defines the training parameters in top-level YAML files -- see
+`examples/megatron/configs/
+<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
+For example, the `llama3.1_8B-pretrain.yaml
+<https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
+configuration imports the ``llama3.1_8B.yaml`` model architecture file. Users can then override
+the default training parameters in ``llama3.1_8B-pretrain.yaml``.
+
+.. code-block:: yaml
+
+   # model to run
+   model: llama3.1_8B.yaml  # Model architecture yaml
+   overrides:
+     # log
+     # disable_wandb: false
+     # disable_tensorboard: false
+     stderr_sink_level: DEBUG
+
+     log_avg_skip_iterations: 2
+     log_avg_reset_interval: 50
+
+     train_iters: 50
+     micro_batch_size: 2
+     global_batch_size: 128
+
+     seq_length: 8192
+     max_position_embeddings: 8192
+
+     lr: 1.0e-5
+     min_lr: 0.0
+     lr_warmup_iters: 2
+     lr_decay_iters: null
+     lr_decay_style: cosine
+     weight_decay: 0.1
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     eod_mask_loss: true
+     init_method_std: 0.008
+     norm_epsilon: 1.0e-6
+
+Backward compatibility with Megatron-LM
+=======================================
+
+The Dockerized environment used for Primus maintains compatibility with Megatron-LM with
+limited support. To roll back to using Megatron-LM, follow these steps.
+
+.. code-block:: shell
+
+   cd /workspace/Megatron-LM/
+   pip uninstall megatron-core
+   pip install -e .
+
+Once Megatron-LM is installed, follow :doc:`the documentation <../megatron-lm>` to run workloads as
+usual.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
@@ -18,7 +18,7 @@ Training a model with ROCm Megatron-LM
 The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
 enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
 accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
-workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
+workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support-24-12>`
 like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
 efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.

@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-24-12:

 The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-25-3:

 The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

@@ -278,7 +278,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
   .. tab-item:: Llama
      :sync: llama

-      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
+      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``Llama2Tokenizer``.

      To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
      Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
@@ -292,7 +292,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
   .. tab-item:: DeepSeek V2
      :sync: deepseek

-      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
+      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``DeepSeekV2Tokenizer``.

 Multi-node training
 ^^^^^^^^^^^^^^^^^^^
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-25-4:

 The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

@@ -291,7 +291,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
   .. tab-item:: Llama
      :sync: llama

-      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
+      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``Llama2Tokenizer``
      or the default ``HuggingFaceTokenizer``.

      To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
@@ -320,7 +320,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
   .. tab-item:: DeepSeek V2
      :sync: deepseek

-      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
+      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``DeepSeekV2Tokenizer``.

 Multi-node training
 ^^^^^^^^^^^^^^^^^^^
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6.rst
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
     - Components
     - Resources

+   * - v25.7
+     - 
+       * ROCm 6.4.2
+       * PyTorch 2.8.0a0+gitd06a406
+     - 
+       * :doc:`Documentation <../pytorch-training>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712>`__
+
   * - v25.6
     - 
       * ROCm 6.3.4
       * PyTorch 2.8.0a0+git7d205b2
     - 
-       * :doc:`Documentation <../pytorch-training>`
+       * :doc:`Documentation <pytorch-training-v25.6>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__

   * - v25.5
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
@@ -437,3 +437,8 @@ Once the setup is complete, choose between two options to start benchmarking:

           ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst
@@ -0,0 +1,456 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using PyTorch for ROCm.
+   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with PyTorch for ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of the ROCm PyTorch
+   training performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
+
+PyTorch is an open-source machine learning framework that is widely used for
+model training with GPU-optimized components for transformer-based models.
+
+The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_
+(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
+model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
+training workloads:
+
+--------------------------+--------------------------------+
+| Software component       | Version                        |
+==========================+================================+
+| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
+| PyTorch                  | 2.8.0a0+git7d205b2             |
+--------------------------+--------------------------------+
+| Python                   | 3.10.17                        |
+--------------------------+--------------------------------+
+| Transformer Engine       | 1.14.0+2f85f5f2                |
+--------------------------+--------------------------------+
+| Flash Attention          | 3.0.0.post1                    |
+--------------------------+--------------------------------+
+| hipBLASLt                | 0.15.0-8c6919d                 |
+--------------------------+--------------------------------+
+| Triton                   | 3.3.0                          |
+--------------------------+--------------------------------+
+
+.. _amd-pytorch-training-model-support-v256:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
+
+   {% set unified_docker = data.unified_docker.latest %}
+   {% set model_groups = data.model_groups %}
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+        <div class="row">
+          <div class="col-2 me-2 model-param-head">Workload</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+          </div>
+        </div>
+
+        <div class="row mt-1">
+          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+          </div>
+        </div>
+      </div>
+
+   .. note::
+
+      Some models require an external license agreement through a third party (for example, Meta).
+
+   .. _amd-pytorch-training-performance-measurements-v256:
+
+   Performance measurements
+   ========================
+
+   To evaluate performance, the
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+   page provides reference throughput and latency measurements for training
+   popular AI models.
+
+   .. note::
+
+      The performance data presented in
+      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+      should not be interpreted as the peak performance achievable by AMD
+      Instinct MI325X and MI300X accelerators or ROCm software.
+
+   System validation
+   =================
+
+   Before running AI workloads, it's important to validate that your AMD hardware is configured
+   correctly and performing optimally.
+
+   If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+   can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+   optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+   before starting training.
+
+   To test for optimal performance, consult the recommended :ref:`System health benchmarks
+   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+   system's configuration.
+
+   This Docker image is optimized for specific model configurations outlined
+   below. Performance can vary for other training workloads, as AMD
+   doesn’t validate configurations and run conditions outside those described.
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to start benchmarking:
+
+   .. tab-set::
+
+      .. tab-item:: MAD-integrated benchmarking
+
+         Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+         directory and install the required packages on the host machine.
+
+         .. code-block:: shell
+
+            git clone https://github.com/ROCm/MAD
+            cd MAD
+            pip install -r requirements.txt
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            For example, use this command to run the performance benchmark test on the {{ model.model }} model
+            using one GPU with the {{ model.precision }} data type on the host machine.
+
+            .. code-block:: shell
+
+               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+               madengine run \
+                   --tags {{ model.mad_tag }} \
+                   --keep-model-dir \
+                   --live-output \
+                   --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/perf.csv``.
+
+      {% endfor %}
+   {% endfor %}
+
+      .. tab-item:: Standalone benchmarking
+
+         .. rubric:: Download the Docker image and required packages
+
+         Use the following command to pull the Docker image from Docker Hub.
+
+         .. code-block:: shell
+
+            docker pull {{ unified_docker.pull_tag }}
+
+         Run the Docker container.
+
+         .. code-block:: shell
+
+            docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
+
+         Use these commands if you exit the ``training_env`` container and need to return to it.
+
+         .. code-block:: shell
+
+            docker start training_env
+            docker exec -it training_env bash
+
+         In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+         repository and navigate to the benchmark scripts directory
+         ``/workspace/MAD/scripts/pytorch_train``.
+
+         .. code-block:: shell
+
+            git clone https://github.com/ROCm/MAD
+            cd MAD/scripts/pytorch_train
+
+         .. rubric:: Prepare training datasets and dependencies
+
+         The following benchmarking examples require downloading models and datasets
+         from Hugging Face. To ensure successful access to gated repos, set your
+         ``HF_TOKEN``.
+
+         .. code-block:: shell
+
+            export HF_TOKEN=$your_personal_hugging_face_access_token
+
+         Run the setup script to install libraries and datasets needed for benchmarking.
+
+         .. code-block:: shell
+
+            ./pytorch_benchmark_setup.sh
+
+         .. container:: model-doc pyt_train_llama-3.1-8b
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``accelerate``
+                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+         .. container:: model-doc pyt_train_llama-3.1-70b
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+               * - ``torchdata``
+                 - `TorchData <https://pytorch.org/data/beta/index.html>`_
+
+               * - ``tomli``
+                 - `Tomli <https://pypi.org/project/tomli/>`_
+
+               * - ``tiktoken``
+                 - `tiktoken <https://github.com/openai/tiktoken>`_
+
+               * - ``blobfile``
+                 - `blobfile <https://pypi.org/project/blobfile/>`_
+
+               * - ``tabulate``
+                 - `tabulate <https://pypi.org/project/tabulate/>`_
+
+               * - ``wandb``
+                 - `Weights & Biases <https://github.com/wandb/wandb>`_
+
+               * - ``sentencepiece``
+                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+
+               * - ``tensorboard``
+                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+
+         .. container:: model-doc pyt_train_flux
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``accelerate``
+                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+               * - ``sentencepiece``
+                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+
+               * - ``tensorboard``
+                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+
+               * - ``csvkit``
+                 - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
+
+               * - ``deepspeed``
+                 - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
+
+               * - ``diffusers``
+                 - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
+
+               * - ``GitPython``
+                 - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
+
+               * - ``opencv-python-headless``
+                 - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
+
+               * - ``peft``
+                 - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
+
+               * - ``protobuf``
+                 - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
+
+               * - ``pytest``
+                 - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
+
+               * - ``python-dotenv``
+                 - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
+
+               * - ``seaborn``
+                 - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
+
+               * - ``transformers``
+                 - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
+
+         ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
+
+         * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+         {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Pretraining
+
+            To start the pre-training benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+            {% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
+               * - ``$datatype``
+                 - ``BF16`` or ``FP8``
+                 - Only Llama 3.1 8B supports FP8 precision.
+            {% else %}
+               * - ``$datatype``
+                 - ``BF16``
+                 - Only Llama 3.1 8B supports FP8 precision.
+            {% endif %}
+
+               * - ``$sequence_length``
+                 - Between 2048 and 8192. 8192 by default.
+                 - Sequence length for the language model.
+
+            {% if model.mad_tag == "pyt_train_flux" %}
+            .. container:: model-doc {{ model.mad_tag }}
+
+               .. note::
+
+                  Occasionally, downloading the Flux dataset might fail. In the event of this
+                  error, manually download it from Hugging Face at
+                  `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+                  and save it to ``/workspace/FluxBenchmark``. This ensures that the test script can access
+                  the required dataset.
+            {% endif %}
+         {% endif %}
+
+         {% if model_group.tag == "fine-tuning" %}
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Fine-tuning
+
+            To start the fine-tuning benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               * - ``$training_mode``
+                 - ``finetune_fw``
+                 - Full weight fine-tuning (BF16 supported)
+
+               * -
+                 - ``finetune_lora``
+                 - LoRA fine-tuning (BF16 supported)
+
+               * -
+                 - ``finetune_qlora``
+                 - QLoRA fine-tuning (BF16 supported)
+
+               * -
+                 - ``HF_finetune_lora``
+                 - LoRA fine-tuning with Hugging Face PEFT
+
+               * - ``$datatype``
+                 - ``BF16``
+                 - All models support BF16.
+
+               * - ``$sequence_length``
+                 - Between 2048 and 16384.
+                 - Sequence length for the language model.
+
+            .. note::
+
+               {{ model.model }} currently supports the following fine-tuning methods:
+
+            {% for method in model.training_modes %}
+               * ``{{ method }}``
+            {% endfor %}
+            {% if model.training_modes|length < 4 %}
+
+               The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
+               does not currently provide YAML configuration files for other combinations of
+               model to fine-tuning method.
+               However, you can still configure your own YAML files to enable support for
+               fine-tuning methods not listed here by following existing patterns in the
+               ``/workspace/torchtune/recipes/configs`` directory.
+            {% endif %}
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+
+               .. rubric:: Benchmarking examples
+
+               For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+
+Further reading
+===============
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
@@ -0,0 +1,602 @@
+.. meta::
+   :description: How to train a model using Megatron-LM for ROCm.
+   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
+
+**********************************************
+Training a model with Primus and Megatron-Core
+**********************************************
+
+`Primus <https://github.com/AMD-AIG-AIMA/Primus>`__ is a unified and flexible
+LLM training framework. It streamlines LLM training on AMD Instinct
+accelerators using a modular, reproducible configuration paradigm.
+Primus is backend-agnostic and supports multiple training engines -- including Megatron-Core.
+
+.. note::
+
+   Primus with the Megatron-Core backend is intended to replace ROCm
+   Megatron-LM in this Dockerized training environment. To learn how to migrate
+   workloads from Megatron-LM to Primus with Megatron-Core, see
+   :doc:`previous-versions/megatron-lm-primus-migration-guide`.
+
+For ease of use, AMD provides a ready-to-use Docker image for MI300 series accelerators
+containing essential components for Primus and Megatron-Core.
+
+.. note::
+
+   This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
+   Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   {% set docker = dockers[0] %}
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}
+
+.. _amd-primus-megatron-lm-model-support:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
+Some instructions, commands, and training examples in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+
+   {% set model_groups = data.model_groups %}
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+            </div>
+         </div>
+
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+            </div>
+         </div>
+      </div>
+
+.. note::
+
+   Some models, such as Llama, require an external license agreement through
+   a third party (for example, Meta).
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+.. _mi300x-amd-primus-megatron-lm-training:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+      {% set docker = dockers[0] %}
+
+   Environment setup
+   =================
+
+   Use the following instructions to set up the environment, configure the script to train models, and
+   reproduce the benchmark results on MI300X series accelerators with the ``{{ docker.pull_tag }}`` image.
+
+   .. _amd-primus-megatron-lm-requirements:
+
+   Download the Docker image
+   -------------------------
+
+   1. Use the following command to pull the Docker image from Docker Hub.
+
+      .. code-block:: shell
+
+         docker pull {{ docker.pull_tag }}
+
+   2. Launch the Docker container.
+
+      .. code-block:: shell
+
+         docker run -it \
+             --device /dev/dri \
+             --device /dev/kfd \
+             --device /dev/infiniband \
+             --network host --ipc host \
+             --group-add video \
+             --cap-add SYS_PTRACE \
+             --security-opt seccomp=unconfined \
+             --privileged \
+             -v $HOME:$HOME \
+             --shm-size 128G \
+             --name primus_training_env \
+             {{ docker.pull_tag }}
+
+   3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.
+
+      .. code-block:: shell
+
+         docker start primus_training_env
+         docker exec -it primus_training_env bash
+
+The Docker container hosts verified release tag ``v0.1.0-rc1`` of the `Primus
+<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1>`__ repository.
+
+.. _amd-primus-megatron-lm-environment-setup:
+
+Configuration
+=============
+
+Primus defines a training configuration in YAML for each model in
+`examples/megatron/configs <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+
+   {% set model_groups = data.model_groups %}
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+   .. container:: model-doc {{ model.mad_tag }}
+
+      To update training parameters for {{ model.model }}, you can update ``examples/megatron/configs/{{ model.config_name }}``.
+      Note that training configuration YAML files for other models follow this naming convention.
+
+      {% endfor %}
+   {% endfor %}
+
+.. note::
+
+   See :ref:`Key options <amd-primus-megatron-lm-benchmark-test-vars>` for more information on configuration options.
+
+Dataset options
+---------------
+
+You can use either mock data or real data for training.
+
+* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default
+  value is ``true`` for enabled.
+
+  .. code-block:: yaml
+
+     mock_data: true
+
+* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset.
+
+  .. code-block:: yaml
+
+     mock_data: false
+     train_data_path: /path/to/your/dataset
+
+  Ensure that the files are accessible inside the Docker container.
+
+.. _amd-primus-megatron-lm-tokenizer:
+
+Tokenizer
+---------
+
+In Primus, each model uses a tokenizer from Hugging Face. For example, the Llama
+3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
+``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model
+<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
+definition. As such, you need to set the ``HF_TOKEN`` environment variable with
+the right permissions to access the tokenizer for each model.
+
+.. code-block:: bash
+
+   # Export your HF_TOKEN in the workspace
+   export HF_TOKEN=<your_hftoken>
+
+.. _amd-primus-megatron-lm-run-training:
+
+Run training
+============
+
+Use the following example commands to set up the environment, configure
+:ref:`key options <amd-primus-megatron-lm-benchmark-test-vars>`, and run training on
+MI300X series accelerators with the AMD Megatron-LM environment.
+
+Single node training
+--------------------
+
+To run training on a single node, navigate to ``/workspace/Primus`` and use the following setup command:
+
+.. code-block:: shell
+
+   pip install -r requirements.txt
+   export HSA_NO_SCRATCH_RECLAIM=1
+   export NVTE_CK_USES_BWD_V3=1
+
+Once setup is complete, run the appropriate training command.
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
+
+   To run pre-training for Llama 3.3 70B BF16, run:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh \
+          --micro_batch_size 2 \
+          --global_batch_size 16 \
+          --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
+
+   To run pre-training for Llama 3.1 8B FP8, run:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh \
+          --train_iters 50 \
+          --fp8 hybrid
+
+   For Llama 3.1 8B BF16, use the following command:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
+
+   To run pre-training for Llama 3.1 70B BF16, run:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh \
+           --train_iters 50
+
+   To run the training on a single node for Llama 3.1 70B FP8 with proxy, use the following command:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh \
+          --train_iters 50 \
+          --num_layers 40 \
+          --fp8 hybrid \
+          --no_fp8_weight_transpose_cache true
+
+   .. note::
+
+      Use two or more nodes to run the *full* Llama 70B model with FP8 precision.
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
+
+   To run pre-training for Llama 2 7B FP8, run:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh \
+          --train_iters 50 \
+          --fp8 hybrid
+
+   To run pre-training for Llama 2 7B BF16, run:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
+
+   To run pre-training for Llama 2 70B BF16, run:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh --train_iters 50 
+
+.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy
+
+   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, 
+   use the following command:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
+      bash examples/run_pretrain.sh \
+          --num_layers 3 \
+          --moe_layer_freq 1 \
+          --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
+
+   To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
+   use the following command:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
+      bash examples/run_pretrain.sh \
+          --global_batch_size 256 \
+          --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
+
+   To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
+   use the following command:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
+      bash examples/run_pretrain.sh --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
+
+   To run training on a single node for Mixtral 8x22B (MoE with expert parallel) with 4-layer proxy,
+   use the following command:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
+      bash examples/run_pretrain.sh \
+          --num_layers 4 \
+          --pipeline_model_parallel_size 1 \
+          --micro_batch_size 1 \
+          --global_batch_size 16 \
+          --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b
+
+   To run training on a single node for Qwen 2.5 7B BF16, use the following
+   command:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
+      bash examples/run_pretrain.sh --train_iters 50
+
+   For FP8, use the following command.
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
+      bash examples/run_pretrain.sh \
+          --train_iters 50 \
+          --fp8 hybrid
+
+.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
+
+   To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
+      bash examples/run_pretrain.sh --train_iters 50
+
+Multi-node training examples
+----------------------------
+
+To run training on multiple nodes, you can use the
+`run_slurm_pretrain.sh <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/run_slurm_pretrain.sh>`__
+script to launch the multi-node workload. Use the following steps to set up your environment:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   {% set docker = dockers[0] %}
+
+   .. code-block:: shell
+
+      cd /workspace/Primus/
+      export DOCKER_IMAGE={{ docker.pull_tag }}
+      export HF_TOKEN=<your_HF_token>
+      export HSA_NO_SCRATCH_RECLAIM=1
+      export NVTE_CK_USES_BWD_V3=1
+      export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
+      export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
+      export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
+      export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
+
+.. note::
+
+   * Make sure the correct network drivers are installed on the nodes. If you're running inside a Docker container, either install the drivers inside the container or pass the network drivers from the host when creating the container.
+   * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect them. However, since NICs can vary across different clusters, it is encouraged to explicitly export your NCCL parameters for the cluster.
+   * To find your network interface, you can use ``ip a``.
+   * To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB devices.
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
+
+   To train Llama 3.3 70B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 4 \
+          --global_batch_size 256 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+   To train Llama 3.3 70B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 1 \
+          --global_batch_size 256 \
+          --recompute_num_layers 12
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
+
+   To train Llama 3.1 8B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters. For e.g., `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case 
+      NNODES=8 EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+      bash ./examples/run_slurm_pretrain.sh \
+          --global_batch_size 1024 \
+          --fp8 hybrid
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
+
+   To train Llama 3.1 70B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 4 \
+          --global_batch_size 256 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+   To train Llama 3.1 70B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 1 \
+          --global_batch_size 256 \
+          --recompute_num_layers 12
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
+
+   To train Llama 2 7B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters. For e.g., `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case 
+      NNODES=8 EXP=examples/megatron/configs/llama2_7B-pretrain.yaml bash ./examples/run_slurm_pretrain.sh --global_batch_size 2048 --fp8 hybrid
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
+
+   To train Llama 2 70B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 10 \
+          --global_batch_size 640 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+   To train Llama 2 70B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+      bash ./examples/run_slurm_pretrain.sh \
+          --micro_batch_size 2 \
+          --global_batch_size 1536 \
+          --recompute_num_layers 12
+
+.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
+
+   To train Mixtral 8x7B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 2 \
+          --global_batch_size 256
+
+.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
+
+   To train Qwen2.5 72B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 8 \
+          --global_batch_size 512 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+.. _amd-primus-megatron-lm-benchmark-test-vars:
+
+Key options
+-----------
+
+The following are key options to take note of:
+
+fp8
+  ``hybrid`` enables FP8 GEMMs.
+
+use_torch_fsdp2
+  ``use_torch_fsdp2: 1`` enables PyTorch FSDP v2. If FSDP is enabled,
+  set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``.
+
+profile
+  To enable PyTorch profiling, set these parameters:
+
+  .. code-block:: yaml
+
+     profile: true
+     use_pytorch_profiler: true
+     profile_step_end: 7
+     profile_step_start: 6
+
+train_iters
+  The total number of iterations (default: 50).
+
+mock_data
+  True by default.
+
+micro_batch_size
+  Micro batch size.
+
+global_batch_size
+  Global batch size.
+
+recompute_granularity
+  For activation checkpointing.
+
+num_layers
+  For using a reduced number of layers as with proxy models.
+
+Previous versions
+=================
+
+See :doc:`previous-versions/megatron-lm-history` to find documentation for previous releases
+of the ``ROCm/megatron-lm`` Docker image.
+
+This training environment now uses Primus with Megatron as the primary
+configuration. Limited support for the legacy ROCm Megatron-LM is still
+available. For instructions on using ROCm Megatron-LM, see the
+:doc:`megatron-lm` document.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
@@ -9,28 +9,25 @@ Training a model with PyTorch for ROCm
 PyTorch is an open-source machine learning framework that is widely used for
 model training with GPU-optimized components for transformer-based models.

-The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
-(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
-model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
-training workloads:
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

-+--------------------------+--------------------------------+
-| Software component       | Version                        |
-+==========================+================================+
-| ROCm                     | 6.3.4                          |
-+--------------------------+--------------------------------+
-| PyTorch                  | 2.8.0a0+git7d205b2             |
-+--------------------------+--------------------------------+
-| Python                   | 3.10.17                        |
-+--------------------------+--------------------------------+
-| Transformer Engine       | 1.14.0+2f85f5f2                |
-+--------------------------+--------------------------------+
-| Flash Attention          | 3.0.0.post1                    |
-+--------------------------+--------------------------------+
-| hipBLASLt                | 0.15.0-8c6919d                 |
-+--------------------------+--------------------------------+
-| Triton                   | 3.3.0                          |
-+--------------------------+--------------------------------+
+   {% set dockers = data.dockers %}
+   {% set docker = dockers[0] %}
+   The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__
+   (``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
+   model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
+   training workloads:
+
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}

 .. _amd-pytorch-training-model-support:

@@ -38,34 +35,35 @@ Supported models
 ================

 The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+Some instructions, commands, and training recommendations in this documentation might
+vary by model -- select one to get started.

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

-   {% set unified_docker = data.unified_docker.latest %}
+   {% set unified_docker = data.dockers[0] %}
   {% set model_groups = data.model_groups %}
-
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
-        <div class="row">
-          <div class="col-2 me-2 model-param-head">Workload</div>
-          <div class="row col-10">
+         <div class="row gx-0">
+            <div class="col-2 me-1 px-2 model-param-head">Model</div>
+            <div class="row col-10 pe-0">
      {% for model_group in model_groups %}
-            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+               <div class="col-3 px-2 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
      {% endfor %}
            </div>
         </div>

-        <div class="row mt-1">
-          <div class="col-2 me-2 model-param-head">Model</div>
-          <div class="row col-10">
+         <div class="row gx-0 pt-1">
+            <div class="col-2 me-1 px-2 model-param-head">Variant</div>
+            <div class="row col-10 pe-0">
      {% for model_group in model_groups %}
         {% set models = model_group.models %}
         {% for model in models %}
            {% if models|length % 3 == 0 %}
-            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-4 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% else %}
-            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+               <div class="col-6 px-2 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
            {% endif %}
         {% endfor %}
      {% endfor %}
@@ -73,56 +71,88 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
         </div>
      </div>

-   .. note::

-      Some models require an external license agreement through a third party (for example, Meta).
+   .. _amd-pytorch-training-supported-training-modes:

-   .. _amd-pytorch-training-performance-measurements:
+   The following table lists supported training modes per model.

-   Performance measurements
-   ========================
+   .. dropdown:: Supported training modes

-   To evaluate performance, the
-   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
-   page provides reference throughput and latency measurements for training
-   popular AI models.
+      .. list-table::
+         :header-rows: 1
+
+         * - Model
+           - Supported training modes
+
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+         * - {{ model.model }}
+           - ``{{ model.training_modes | join('``, ``') }}``
+
+         {% endfor %}
+      {% endfor %}

      .. note::

+         Some model and fine-tuning combinations are not listed. This is
+         because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
+         doesn't provide default YAML configurations for them.
+         For advanced usage, you can create a custom configuration to enable
+         unlisted fine-tuning methods by using an existing file in the
+         ``/workspace/torchtune/recipes/configs`` directory as a template.
+
+.. _amd-pytorch-training-performance-measurements:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+page provides reference throughput and latency measurements for training
+popular AI models.
+
+.. note::
+
   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
   should not be interpreted as the peak performance achievable by AMD
   Instinct MI325X and MI300X accelerators or ROCm software.

-   System validation
-   =================
+System validation
+=================

-   Before running AI workloads, it's important to validate that your AMD hardware is configured
-   correctly and performing optimally.
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.

-   If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-   can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-   optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
-   before starting training.
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.

-   To test for optimal performance, consult the recommended :ref:`System health benchmarks
-   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-   system's configuration.
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.

-   This Docker image is optimized for specific model configurations outlined
-   below. Performance can vary for other training workloads, as AMD
-   doesn’t validate configurations and run conditions outside those described.
+This Docker image is optimized for specific model configurations outlined
+below. Performance can vary for other training workloads, as AMD
+doesn’t test configurations and run conditions outside those described.

-   Benchmarking
-   ============
+Run training
+============

-   Once the setup is complete, choose between two options to start benchmarking:
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+
+   {% set unified_docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}
+
+   Once the setup is complete, choose between two options to start benchmarking training:

   .. tab-set::

      .. tab-item:: MAD-integrated benchmarking

-         Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+         1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
            directory and install the required packages on the host machine.

            .. code-block:: shell
@@ -136,8 +166,8 @@ The following models are pre-optimized for performance on the AMD Instinct MI325

         .. container:: model-doc {{ model.mad_tag }}

-            For example, use this command to run the performance benchmark test on the {{ model.model }} model
-            using one GPU with the {{ model.precision }} data type on the host machine.
+            2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
+               using one node with the {{ model.precision }} data type on the host machine.

               .. code-block:: shell

@@ -149,8 +179,8 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
                      --timeout 28800

               MAD launches a Docker container with the name
-            ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
-            model are collected in the following path: ``~/MAD/perf.csv``.
+               ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
+               model are collected in ``~/MAD/perf.csv``.

      {% endfor %}
   {% endfor %}
@@ -159,17 +189,30 @@ The following models are pre-optimized for performance on the AMD Instinct MI325

         .. rubric:: Download the Docker image and required packages

-         Use the following command to pull the Docker image from Docker Hub.
+         1. Use the following command to pull the Docker image from Docker Hub.

            .. code-block:: shell

               docker pull {{ unified_docker.pull_tag }}

-         Run the Docker container.
+         2. Run the Docker container.

            .. code-block:: shell

-            docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
+               docker run -it \
+                   --device /dev/dri \
+                   --device /dev/kfd \
+                   --network host \
+                   --ipc host \
+                   --group-add video \
+                   --cap-add SYS_PTRACE \
+                   --security-opt seccomp=unconfined \
+                   --privileged \
+                   -v $HOME:$HOME \
+                   -v $HOME/.ssh:/root/.ssh \
+                   --shm-size 64G \
+                   --name training_env \
+                   {{ unified_docker.pull_tag }}

            Use these commands if you exit the ``training_env`` container and need to return to it.

@@ -178,7 +221,7 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
               docker start training_env
               docker exec -it training_env bash

-         In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+         3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
            repository and navigate to the benchmark scripts directory
            ``/workspace/MAD/scripts/pytorch_train``.

@@ -189,7 +232,7 @@ The following models are pre-optimized for performance on the AMD Instinct MI325

         .. rubric:: Prepare training datasets and dependencies

-         The following benchmarking examples require downloading models and datasets
+         1. The following benchmarking examples require downloading models and datasets
            from Hugging Face. To ensure successful access to gated repos, set your
            ``HF_TOKEN``.

@@ -197,7 +240,7 @@ The following models are pre-optimized for performance on the AMD Instinct MI325

               export HF_TOKEN=$your_personal_hugging_face_access_token

-         Run the setup script to install libraries and datasets needed for benchmarking.
+         2. Run the setup script to install libraries and datasets needed for benchmarking.

            .. code-block:: shell

@@ -317,64 +360,42 @@ The following models are pre-optimized for performance on the AMD Instinct MI325

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
-         {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
+         {% set training_modes = model.training_modes %}
+         {% set training_mode_descs = {
+            "pretrain": "Benchmark pre-training.",
+            "HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
+         } %}
+         {% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
+         {% if available_modes %}

         .. container:: model-doc {{ model.mad_tag }}

-            .. rubric:: Pretraining
+            .. rubric:: Pre-training

            To start the pre-training benchmark, use the following command with the
            appropriate options. See the following list of options and their descriptions.

            .. code-block:: shell

-               ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
-
-            .. list-table::
-               :header-rows: 1
-
-               * - Name
-                 - Options
-                 - Description
-
-            {% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
-               * - ``$datatype``
-                 - ``BF16`` or ``FP8``
-                 - Only Llama 3.1 8B supports FP8 precision.
-            {% else %}
-               * - ``$datatype``
-                 - ``BF16``
-                 - Only Llama 3.1 8B supports FP8 precision.
-            {% endif %}
-
-               * - ``$sequence_length``
-                 - Sequence length for the language model.
-                 - Between 2048 and 8192. 8192 by default.
+               ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length

            {% if model.mad_tag == "pyt_train_flux" %}
            .. container:: model-doc {{ model.mad_tag }}

               .. note::

+                  Currently, FLUX models are not supported out of the box on {{ unified_docker.pull_tag }}.
+                  To use FLUX, refer to the previous version of the ``pytorch-training`` Docker image: :doc:`previous-versions/pytorch-training-v25.6`.
+
                  Occasionally, downloading the Flux dataset might fail. In the event of this
                  error, manually download it from Hugging Face at
                  `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
                  and save it to `/workspace/FluxBenchmark`. This ensures that the test script can access
                  the required dataset.
            {% endif %}
-         {% endif %}
-
-         {% if model_group.tag == "fine-tuning" %}
-         .. container:: model-doc {{ model.mad_tag }}
-
-            .. rubric:: Fine-tuning
-
-            To start the fine-tuning benchmark, use the following command with the
-            appropriate options. See the following list of options and their descriptions.
-
-            .. code-block:: shell
-
-               ./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length

            .. list-table::
               :header-rows: 1
@@ -383,45 +404,91 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
                 - Options
                 - Description

-               * - ``$training_mode``
-                 - ``finetune_fw``
-                 - Full weight fine-tuning (BF16 supported)
-
-               * -
-                 - ``finetune_lora``
-                 - LoRA fine-tuning (BF16 supported)
-
-               * -
-                 - ``finetune_qlora``
-                 - QLoRA fine-tuning (BF16 supported)
-
-               * -
-                 - ``HF_finetune_lora``
-                 - LoRA fine-tuning with Hugging Face PEFT
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}

               * - ``$datatype``
-                 - ``BF16``
-                 - All models support BF16.
+                 - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
+                 - Only Llama 3.1 8B supports FP8 precision.
+
+               * - ``$sequence_length``
+                 - Between 2048 and 8192. 8192 by default.
+                 - Sequence length for the language model.
+         {% endif %}
+
+         {% set training_mode_descs = {
+            "finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
+            "finetune_lora": "LoRA fine-tuning (BF16 supported).",
+            "finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
+            "HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
+         } %}
+         {% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
+         {% if available_modes %}
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Fine-tuning
+
+            To start the fine-tuning benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+            For more information, see :ref:`supported training modes <amd-pytorch-training-supported-training-modes>`.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t $training_mode \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}
+
+               * - ``$datatype``
+                 - ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
+                 - All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}

               * - ``$sequence_length``
                 - Between 2048 and 16384.
                 - Sequence length for the language model.

+            {% if model.mad_tag in ["pyt_train_llama-3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
            .. note::

-               {{ model.model }} currently supports the following fine-tuning methods:
+               For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
+               use the following torchtune commit for compatibility:

-            {% for method in model.training_modes %}
-               * ``{{ method }}``
-            {% endfor %}
-            {% if model.training_modes|length < 4 %}
+               .. code-block:: shell
+
+                  git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
+
+            {% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
+            .. note::
+
+               You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
+               input tensor should be smaller than max_seq_len (4096)``.
+               This error indicates that an input sequence is longer than the model's maximum context window.
+
+               Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
+               tokens in this case). You can resolve this by truncating the input or splitting
+               it into smaller chunks before passing it to the model.
+
+               Note on reproducibility: The results in this guide are based on
+               commit ``b4c98ac`` from the upstream
+               `<https://github.com/pytorch/torchtune>`__ repository. For the
+               latest updates, you can use the main branch.

-               The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
-               does not currently provide YAML configuration files for other combinations of
-               model to fine-tuning method
-               However, you can still configure your own YAML files to enable support for
-               fine-tuning methods not listed here by following existing patterns in the
-               ``/workspace/torchtune/recipes/configs`` directory.
            {% endif %}
         {% endif %}
      {% endfor %}
@@ -431,6 +498,50 @@ The following models are pre-optimized for performance on the AMD Instinct MI325

            For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.

+Multi-node training
+-------------------
+
+Pre-training
+~~~~~~~~~~~~
+
+Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch run_slurm_train.sh
+
+Fine-tuning
+~~~~~~~~~~~
+
+Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   huggingface-cli login # Get access to HF Llama model space
+   huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch Torchtune_Multinode.sh
+
+.. note::
+
+   Information regarding benchmark setup:
+
+   * By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
+   * You can adjust the torchtune `YAML configuration file
+     <https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
+     if you're using a different model.
+   * The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
+   * Set the ``mounting_paths`` inside the SLURM script.
+
+Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.
+
 Further reading
 ===============

--- a/docs/how-to/rocm-for-ai/training/index.rst
+++ b/docs/how-to/rocm-for-ai/training/index.rst
@@ -21,6 +21,8 @@ In this guide, you'll learn about:

 - Training a model

+  - :doc:`With Primus (Megatron-LM backend) <benchmark-docker/primus-megatron>`
+
  - :doc:`With Megatron-LM <benchmark-docker/megatron-lm>`

  - :doc:`With PyTorch <benchmark-docker/pytorch-training>`
--- a/docs/reference/gpu-arch-specs.rst
+++ b/docs/reference/gpu-arch-specs.rst
@@ -319,7 +319,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - Radeon AI PRO R9700
          - RDNA4
          - gfx1201
-          - 16
+          - 32
          - 64
          - 32 or 64
          - 128
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -27,6 +27,28 @@ subtrees:
    title: ROCm on Radeon GPUs
  - file: how-to/deep-learning-rocm.md
    title: Deep learning frameworks
+    subtrees:
+    - entries:
+      - file: compatibility/ml-compatibility/pytorch-compatibility.rst
+        title: PyTorch compatibility
+      - file: compatibility/ml-compatibility/tensorflow-compatibility.rst
+        title: TensorFlow compatibility
+      - file: compatibility/ml-compatibility/jax-compatibility.rst
+        title: JAX compatibility
+      - file: compatibility/ml-compatibility/verl-compatibility.rst
+        title: verl compatibility
+      - file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
+        title: Stanford Megatron-LM compatibility
+      - file: compatibility/ml-compatibility/dgl-compatibility.rst
+        title: DGL compatibility
+      - file: compatibility/ml-compatibility/megablocks-compatibility.rst
+        title: Megablocks compatibility
+      - file: compatibility/ml-compatibility/taichi-compatibility.rst
+        title: Taichi compatibility
+      - file: compatibility/ml-compatibility/ray-compatibility.rst
+        title: Ray compatibility
+      - file: compatibility/ml-compatibility/llama-cpp-compatibility.rst
+        title: llama.cpp compatibility
  - file: how-to/build-rocm.rst
    title: Build ROCm from source

@@ -44,8 +66,8 @@ subtrees:
        title: Training
        subtrees:
        - entries:
-          - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
-            title: Train a model with Megatron-LM
+          - file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+            title: Train a model with Primus and Megatron-Core
          - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
            title: Train a model with PyTorch
          - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -234,7 +234,7 @@ sphinx-notfound-page==1.1.0
    # via rocm-docs-core
 sphinx-reredirects==0.1.6
    # via -r requirements.in
-sphinx-sitemap==2.7.2
+sphinx-sitemap==2.8.0
    # via -r requirements.in
 sphinxcontrib-applehelp==2.0.0
    # via sphinx
--- a/docs/sphinx/static/css/vllm-benchmark.css
+++ b/docs/sphinx/static/css/vllm-benchmark.css
@@ -7,15 +7,14 @@ html {
  --compat-head-color: var(--pst-color-surface);
  --compat-param-hover-color: var(--pst-color-link-hover);
  --compat-param-selected-color: var(--pst-color-primary);
+  --compat-border-color: var(--pst-color-border);
 }

 html[data-theme="light"] {
-  --compat-border-color: var(--pst-gray-500);
  --compat-param-disabled-color: var(--pst-gray-300);
 }

 html[data-theme="dark"] {
-  --compat-border-color: var(--pst-gray-600);
  --compat-param-disabled-color: var(--pst-gray-600);
 }

@@ -23,6 +22,7 @@ div#vllm-benchmark-ud-params-picker.container-fluid {
  padding: 0 0 1rem 0;
 }

+div[data-param-k="model-group"],
 div[data-param-k="model"] {
  background-color: var(--compat-bg-color);
  padding: 2px;
@@ -31,40 +31,19 @@ div[data-param-k="model"] {
  cursor: pointer;
 }

+div[data-param-k="model-group"][data-param-state="selected"],
 div[data-param-k="model"][data-param-state="selected"] {
  background-color: var(--compat-param-selected-color);
  color: var(--compat-fg-color);
 }

-div[data-param-k="model"][data-param-state="latest-version"] {
-  background-color: var(--compat-param-selected-color);
-  color: var(--compat-fg-color);
-}
-
-div[data-param-k="model"][data-param-state="disabled"] {
-  background-color: var(--compat-param-disabled-color);
-  text-decoration: line-through;
-  /* text-decoration-color: var(--pst-color-danger); */
-  cursor: auto;
-}
-
-div[data-param-k="model"]:not([data-param-state]):hover {
+div[data-param-k="model-group"]:hover,
+div[data-param-k="model"]:hover {
  background-color: var(--compat-param-hover-color);
-}
-
-div[data-param-k="model-group"] {
-  background-color: var(--compat-bg-color);
-  padding: 2px;
-  border: solid 1px var(--compat-border-color);
-  font-weight: 500;
-  cursor: pointer;
-}
-
-div[data-param-k="model-group"][data-param-state="selected"] {
-  background-color: var(--compat-param-selected-color);
  color: var(--compat-fg-color);
 }

+/*
 div[data-param-k="model-group"][data-param-state="latest-version"] {
  background-color: var(--compat-param-selected-color);
  color: var(--compat-fg-color);
@@ -73,26 +52,19 @@ div[data-param-k="model-group"][data-param-state="latest-version"] {
 div[data-param-k="model-group"][data-param-state="disabled"] {
  background-color: var(--compat-param-disabled-color);
  text-decoration: line-through;
-  /* text-decoration-color: var(--pst-color-danger); */
+  text-decoration-color: var(--pst-color-danger);
  cursor: auto;
 }
-
-div[data-param-k="model-group"]:not([data-param-state]):hover {
-  background-color: var(--compat-param-hover-color);
-}
+*/

 .model-param-head {
  background-color: var(--compat-head-color);
  padding: 0.15rem 0.15rem 0.15rem 0.67rem;
-  /* margin: 2px; */
-  border-right: solid 2px var(--compat-accent-color);
+  border-right: solid 4px var(--compat-accent-color);
  font-weight: 600;
 }

 .model-param {
-  /* padding: 2px; */
-  /* margin: 0 2px 0 2px; */
-  /* margin: 2px; */
  border: solid 1px var(--compat-border-color);
  font-weight: 500;
 }