Add pip module installs to test job

Quick fix
Remove nanobind install script
2026-01-10 07:08:08 -05:00 · 2025-09-09 19:40:46 +00:00 · 2025-09-09 19:29:12 +00:00 · 2025-09-09 19:27:41 +00:00 · 2025-09-09 17:27:21 +00:00 · 2025-09-09 17:18:17 +00:00
82 changed files with 5165 additions and 984 deletions
--- a/.azuredevops/components/HIP.yml
+++ b/.azuredevops/components/HIP.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: hip_clr_combined
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -35,93 +54,24 @@ parameters:
  type: object
  default:
    - llvm-project
-
-# hip and clr are tightly-coupled
-# run this same template for both repos
-# any changes for clr should just trigger HIP pipeline
-# similarly for hipother repo, for Nvidia backend
+    - ROCR-Runtime

 - name: jobMatrix
  type: object
  default:
    buildJobs:
-      - { os: ubuntu2204, packageManager: apt }
-      - { os: almalinux8, packageManager: dnf }
+      - { os: ubuntu2204, packageManager: apt, platform: amd }
+      - { os: ubuntu2204, packageManager: apt, platform: nvidia }
+      - { os: almalinux8, packageManager: dnf, platform: amd }
+      - { os: almalinux8, packageManager: dnf, platform: nvidia }

-# HIP with AMD backend
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: hip_clr_combined_${{ job.os }}_amd
-    pool:
-      vmImage: 'ubuntu-22.04'
-    ${{ if eq(job.os, 'almalinux8') }}:
-      container:
-        image: rocmexternalcicd.azurecr.io/manylinux228:latest
-        endpoint: ContainerService3
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        packageManager: ${{ job.packageManager }}
-  # checkout triggering repo (either HIP or clr)
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: ${{ parameters.checkoutRepo }}
-  # if this is triggered by HIP repo, matching repo is clr
-  # if this is triggered by clr repo, matching repo is HIP
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: matching_repo
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: hipother_repo
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependenciesAMD }}
-        aggregatePipeline: ${{ parameters.aggregatePipeline }}
-        os: ${{ job.os }}
-  # compile clr
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        componentName: clr
-        cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
-        cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
-        os: ${{ job.os }}
-        useAmdclang: false
-        extraBuildFlags: >-
-          -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
-          -DHIP_PLATFORM=amd
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
-          -DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
-          -DCLR_BUILD_HIP=ON
-          -DCLR_BUILD_OCL=ON
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
-      parameters:
-        artifactName: amd
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
-      parameters:
-        artifactName: amd
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-    #   parameters:
-    #     aptPackages: ${{ parameters.aptPackages }}
-    #     pipModules: ${{ parameters.pipModules }}
-    #     environment: amd
-
-# HIP with Nvidia backend
- ${{ each job in parameters.jobMatrix.buildJobs }}:
-  - job: hip_clr_combined_${{ job.os }}_nvidia
+  - job: ${{ parameters.componentName }}_${{ job.os }}_${{ job.platform }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
    pool:
      vmImage: 'ubuntu-22.04'
    ${{ if eq(job.os, 'almalinux8') }}:
@@ -140,49 +90,45 @@ jobs:
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-  # checkout triggering repo (either HIP or clr)
+    # full checkout of rocm-systems superrepo, we need clr, hip, and hipother
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-  # if this is triggered by HIP repo, matching repo is clr
-  # if this is triggered by clr repo, matching repo is HIP
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: matching_repo
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: hipother_repo
+        # sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependenciesNvidia }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
-    - script: 'ls -1R $(Agent.BuildDirectory)/rocm'
-      displayName: 'Artifact listing'
-  # compile clr
+        ${{ if eq(job.platform, 'amd') }}:
+          dependencyList: ${{ parameters.rocmDependenciesAMD }}
+        ${{ elseif eq(job.platform, 'nvidia') }}:
+          dependencyList: ${{ parameters.rocmDependenciesNvidia }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        componentName: clr
-        cmakeBuildDir: '$(Build.SourcesDirectory)/clr/build'
-        cmakeSourceDir: '$(Build.SourcesDirectory)/clr'
+        cmakeBuildDir: $(Agent.BuildDirectory)/s/projects/clr/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/s/projects/clr
        os: ${{ job.os }}
        useAmdclang: false
        extraBuildFlags: >-
-          -DHIP_COMMON_DIR=$(Build.SourcesDirectory)/HIP
-          -DHIP_PLATFORM=nvidia
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DHIPCC_BIN_DIR=$(Agent.BuildDirectory)/rocm/bin
+          -DHIP_COMMON_DIR=$(Agent.BuildDirectory)/s/projects/hip
+          -DHIPNV_DIR=$(Agent.BuildDirectory)/s/projects/hipother/hipnv
+          -DHIP_PLATFORM=${{ job.platform }}
          -DCLR_BUILD_HIP=ON
-          -DCLR_BUILD_OCL=OFF
-          -DHIPNV_DIR=$(Build.SourcesDirectory)/hipother/hipnv
+          -DCLR_BUILD_OCL=ON
          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+      parameters:
+        artifactName: ${{ job.platform }}
+        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
-        artifactName: nvidia
+        artifactName: ${{ job.platform }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
-    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-    #   parameters:
-    #     aptPackages: ${{ parameters.aptPackages }}
-    #     pipModules: ${{ parameters.pipModules }}
-    #     environment: nvidia
--- a/.azuredevops/components/MIOpen.yml
+++ b/.azuredevops/components/MIOpen.yml
@@ -123,7 +123,7 @@ jobs:
    - template: /.azuredevops/variables-global.yml
    - name: ROCM_PATH
      value: $(Agent.BuildDirectory)/rocm
-    pool: ${{ variables.HIGH_BUILD_POOL }}
+    pool: ${{ variables.MEDIUM_BUILD_POOL }}
    workspace:
      clean: all
    steps:
@@ -131,6 +131,7 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -149,6 +150,7 @@ jobs:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - task: Bash@3
      displayName: Build and install other dependencies
+      retryCountOnTaskFailure: 3
      inputs:
        targetType: inline
        workingDirectory: $(Agent.BuildDirectory)/s
@@ -210,6 +212,7 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -228,6 +231,7 @@ jobs:
          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - task: Bash@3
      displayName: Build and install other dependencies
+      retryCountOnTaskFailure: 3
      inputs:
        targetType: inline
        workingDirectory: $(Agent.BuildDirectory)/s
--- a/.azuredevops/components/Tensile.yml
+++ b/.azuredevops/components/Tensile.yml
@@ -171,6 +171,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - task: DownloadPipelineArtifact@2
      displayName: 'Download Pipeline Wheel Files'
+      retryCountOnTaskFailure: 3
      inputs:
        itemPattern: '**/*${{ job.os }}*.whl'
        targetPath: $(Agent.BuildDirectory)
--- a/.azuredevops/components/hipBLASLt.yml
+++ b/.azuredevops/components/hipBLASLt.yml
@@ -35,9 +35,13 @@ parameters:
    - ccache
    - gfortran
    - git
+    - libboost-filesystem-dev
+    - libboost-program-options-dev
    - libdrm-dev
+    - liblapack-dev
    - libmsgpack-dev
    - libnuma-dev
+    - libopenblas-dev
    - ninja-build
    - python3-pip
    - python3-venv
@@ -46,6 +50,12 @@ parameters:
  default:
    - joblib
    - "packaging>=22.0"
+    - pyyaml
+    - msgpack
+    - simplejson
+    - ujson
+    - orjson
+    - yappi
    - --upgrade
 - name: rocmDependencies
  type: object
@@ -81,12 +91,12 @@ parameters:
      - { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx90a }
      - { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1201 }
      - { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1100 }
-      - { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1030 }
+      #- { pool: rocm-ci_medium_build_pool, os: ubuntu2204, packageManager: apt, target: gfx1030 }
      - { pool: rocm-ci_ultra_build_pool, os: almalinux8, packageManager: dnf, target: gfx942 }
      - { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx90a }
      - { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1201 }
      - { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1100 }
-      - { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1030 }
+      #- { pool: rocm-ci_medium_build_pool, os: almalinux8, packageManager: dnf, target: gfx1030 }
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
@@ -168,8 +178,8 @@ jobs:
          mkdir -p $(Agent.BuildDirectory)/temp-deps
          cd $(Agent.BuildDirectory)/temp-deps
          # position-independent LAPACK is required for almalinux8 builds
-          cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/s/deps
-          make
+          cmake -DBUILD_GTEST=OFF -DBUILD_LAPACK=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON $(Agent.BuildDirectory)/sparse/projects/hipblaslt/deps
+          make -j
          sudo make install
    - script: |
        mkdir -p $(CCACHE_DIR)
@@ -187,6 +197,8 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
+        cmakeSourceDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt
+        cmakeBuildDir: $(Agent.BuildDirectory)/sparse/projects/hipblaslt/build
        extraBuildFlags: >-
          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor
          -DCMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
@@ -195,7 +207,11 @@ jobs:
          -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
          -DCMAKE_C_COMPILER_LAUNCHER=ccache
          -DAMDGPU_TARGETS=${{ job.target }}
+          -DGPU_TARGETS=${{ job.target }}
          -DBUILD_CLIENTS_TESTS=ON
+          -DHIPBLASLT_ENABLE_ROCROLLER=ON
+          -DHIPBLASLT_ENABLE_FETCH=ON
+          -DHIPBLASLT_ENABLE_BLIS=OFF
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
--- a/.azuredevops/components/hipSPARSE.yml
+++ b/.azuredevops/components/hipSPARSE.yml
@@ -69,7 +69,7 @@ parameters:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
+      #- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
--- a/.azuredevops/components/hipSPARSELt.yml
+++ b/.azuredevops/components/hipSPARSELt.yml
@@ -113,7 +113,8 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
-        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+        # ignore sparse checkout for monorepo case, we want access to hipblaslt directory
+        # sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
@@ -130,7 +131,10 @@ jobs:
      displayName: Create temp folder for external dependencies
  # hipSPARSELt already has a CMake script for external deps, so we can just run that
  # https://github.com/ROCm/hipSPARSELt/blob/develop/deps/CMakeLists.txt
-    - script: cmake $(Pipeline.Workspace)/s/deps
+    - ${{ if ne(parameters.sparseCheckoutDir, '') }}:
+        script: cmake $(Pipeline.Workspace)/s/projects/hipsparselt/deps
+      ${{ else }}:
+        script: cmake $(Pipeline.Workspace)/s/deps
      displayName: Configure hipSPARSELt external dependencies
      workingDirectory: $(Pipeline.Workspace)/deps
    - script: make
@@ -154,7 +158,11 @@ jobs:
          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm"
          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DBUILD_CLIENTS_TESTS=ON
+          -DBUILD_USE_LOCAL_TENSILE=OFF
          -GNinja
+        ${{ if ne(parameters.sparseCheckoutDir, '') }}:
+          cmakeSourceDir: $(Build.SourcesDirectory)/projects/hipsparselt
+          cmakeBuildDir: $(Build.SourcesDirectory)/projects/hipsparselt
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
--- a/.azuredevops/components/llvm-project.yml
+++ b/.azuredevops/components/llvm-project.yml
@@ -30,7 +30,7 @@ parameters:
  default:
    buildJobs:
      - { os: ubuntu2204, packageManager: apt }
-      - { os: ubuntu2404, packageManager: apt }
+      # - { os: ubuntu2404, packageManager: apt }
      - { os: almalinux8, packageManager: dnf }

 jobs:
--- a/.azuredevops/components/origami.yml
+++ b/.azuredevops/components/origami.yml
@@ -0,0 +1,236 @@
+parameters:
+- name: componentName
+  type: string
+  default: origami
+- name: checkoutRepo
+  type: string
+  default: 'self'
+- name: checkoutRef
+  type: string
+  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
+# set to true if doing full build of ROCm stack
+# and dependencies are pulled from same pipeline
+- name: aggregatePipeline
+  type: boolean
+  default: false
+- name: aptPackages
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+    - wget
+    - python3
+    - python3-dev
+    - python3-pip
+- name: pipModules
+  type: object
+  default:
+    - nanobind>=2.0.0
+- name: rocmDependencies
+  type: object
+  default:
+    - clr
+    - llvm-project
+    - rocm-cmake
+    - rocminfo
+    - ROCR-Runtime
+    - rocprofiler-register
+- name: rocmTestDependencies
+  type: object
+  default:
+    - clr
+    - llvm-project
+    - rocm-cmake
+    - rocminfo
+    - ROCR-Runtime
+    - rocprofiler-register
+
+- name: jobMatrix
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt }
+      - { os: almalinux8, packageManager: dnf }
+    testJobs:
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+- name: downstreamComponentMatrix
+  type: object
+  default:
+    - hipBLASLt:
+      name: hipBLASLt
+      sparseCheckoutDir: projects/hipblaslt
+      skipUnifiedBuild: 'false'
+      buildDependsOn:
+        - origami_build
+
+jobs:
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: origami_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    - name: ROCM_PATH
+      value: $(Agent.BuildDirectory)/rocm
+    pool:
+      vmImage: ${{ variables.BASE_BUILD_POOL }}
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        pipModules: ${{ parameters.pipModules }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+      parameters:
+        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+      parameters:
+        checkoutRef: ${{ parameters.checkoutRef }}
+        dependencyList: ${{ parameters.rocmDependencies }}
+        os: ${{ job.os }}
+        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        extraBuildFlags: >-
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+          -DCMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+          -DORIGAMI_BUILD_SHARED_LIBS=ON
+          -DORIGAMI_ENABLE_PYTHON=ON
+          -DORIGAMI_BUILD_TESTING=ON
+          -GNinja
+    - ${{ if ne(job.os, 'almalinux8') }}:
+      - task: PublishPipelineArtifact@1
+        displayName: 'Publish Build Directory Artifact'
+        inputs:
+          targetPath: '$(Agent.BuildDirectory)/s/build'
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
+          publishLocation: 'pipeline'
+      - task: PublishPipelineArtifact@1
+        displayName: 'Publish Python Source Artifact'
+        inputs:
+          targetPath: '$(Agent.BuildDirectory)/s/python'
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
+          publishLocation: 'pipeline'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
+      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+        os: ${{ job.os }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
+        componentName: ${{ parameters.componentName }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
+
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: origami_test_${{ job.os }}_${{ job.target }}
+      timeoutInMinutes: 120
+      dependsOn: origami_build_${{ job.os }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          packageManager: ${{ job.packageManager }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
+          os: ${{ job.os }}
+      - task: DownloadPipelineArtifact@2
+        displayName: 'Download Build Directory Artifact'
+        inputs:
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_build_dir'
+          path: '$(Agent.BuildDirectory)/s/build'
+      - task: DownloadPipelineArtifact@2
+        displayName: 'Download Python Source Artifact'
+        inputs:
+          artifact: '${{ parameters.componentName }}_${{ job.os }}_python_src'
+          path: '$(Agent.BuildDirectory)/s/python'
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          os: ${{ job.os }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - script: |
+          export PYTHONPATH=$(Agent.BuildDirectory)/s/build/python:$PYTHONPATH
+
+          echo "--- Running origami_test.py ---"
+          python3 $(Agent.BuildDirectory)/s/python/origami_test.py
+          
+          echo "--- Running origami_grid_test.py ---"
+          python3 $(Agent.BuildDirectory)/s/python/origami_grid_test.py
+        displayName: 'Run Python Binding Tests'
+        condition: succeeded()
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+
+- ${{ if parameters.triggerDownstreamJobs }}:
+  - ${{ each component in parameters.downstreamComponentMatrix }}:
+    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
+      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+          buildDependsOn: ${{ component.buildDependsOn }}
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
+          triggerDownstreamJobs: true
+          unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/rccl.yml
+++ b/.azuredevops/components/rccl.yml
@@ -76,7 +76,7 @@ jobs:
    - template: /.azuredevops/variables-global.yml
    - name: HIP_ROCCLR_HOME
      value: $(Build.BinariesDirectory)/rocm
-    pool: ${{ variables.HIGH_BUILD_POOL }}
+    pool: ${{ variables.MEDIUM_BUILD_POOL }}
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocBLAS.yml
+++ b/.azuredevops/components/rocBLAS.yml
@@ -84,12 +84,12 @@ parameters:
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
+      #- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
      - { os: almalinux8, packageManager: dnf, target: gfx942 }
      - { os: almalinux8, packageManager: dnf, target: gfx90a }
      - { os: almalinux8, packageManager: dnf, target: gfx1201 }
      - { os: almalinux8, packageManager: dnf, target: gfx1100 }
-      - { os: almalinux8, packageManager: dnf, target: gfx1030 }
+      #- { os: almalinux8, packageManager: dnf, target: gfx1030 }
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
@@ -115,6 +115,13 @@ parameters:
 #        buildDependsOn:
 #          - rocBLAS_build
 #          - rocPRIM_build
+    # temporary rocblas->hipblas downstream path while the SOLVERs are disabled
+    - hipBLAS:
+      name: hipBLAS
+      sparseCheckoutDir: projects/hipblas
+      skipUnifiedBuild: 'false'
+      buildDependsOn:
+        - rocBLAS_build

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
--- a/.azuredevops/components/rocDecode.yml
+++ b/.azuredevops/components/rocDecode.yml
@@ -8,6 +8,25 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+- name: rocPyDecodeRepo
+  type: string
+  default: rocpydecode_repo
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -56,10 +75,23 @@ parameters:
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
+- name: downstreamComponentMatrix
+  type: object
+  default:
+    - rocPyDecode:
+      name: rocPyDecode
+      sparseCheckoutDir: ''
+      skipUnifiedBuild: 'false'
+      buildDependsOn:
+        - rocDecode_build

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -83,12 +115,15 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
@@ -169,3 +204,15 @@ jobs:
        registerROCmPackages: true
        environment: test
        gpuTarget: ${{ job.target }}
+
+- ${{ if parameters.triggerDownstreamJobs }}:
+  - ${{ each component in parameters.downstreamComponentMatrix }}:
+    - ${{ if not(and(parameters.unifiedBuild, eq(component.skipUnifiedBuild, 'true'))) }}:
+      - template: /.azuredevops/components/${{ component.name }}.yml@pipelines_repo
+        parameters:
+          checkoutRepo: ${{ parameters.rocPyDecodeRepo }}
+          sparseCheckoutDir: ${{ component.sparseCheckoutDir }}
+          buildDependsOn: ${{ component.buildDependsOn }}
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}+${{ parameters.componentName }}
+          triggerDownstreamJobs: true
+          unifiedBuild: ${{ parameters.unifiedBuild }}
--- a/.azuredevops/components/rocPyDecode.yml
+++ b/.azuredevops/components/rocPyDecode.yml
@@ -5,6 +5,22 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -47,19 +63,19 @@ parameters:
  type: object
  default:
    buildJobs:
-      - gfx942:
-        target: gfx942
-      - gfx90a:
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
    testJobs:
-      - gfx942:
-        target: gfx942
-      - gfx90a:
-        target: gfx90a
+      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
+      - { os: ubuntu2204, packageManager: apt, target: gfx90a }

 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocPyDecode_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -74,16 +90,20 @@ jobs:
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
+        packageManager: ${{ job.packageManager }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
        dependencyList: ${{ parameters.rocmDependencies }}
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - task: Bash@3
      displayName: 'Save Python Package Paths'
      inputs:
@@ -190,6 +210,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - task: DownloadPipelineArtifact@2
      displayName: 'Download Pipeline Wheel Files'
+      retryCountOnTaskFailure: 3
      inputs:
        itemPattern: '**/*.whl'
        targetPath: $(Agent.BuildDirectory)
--- a/.azuredevops/components/rocSOLVER.yml
+++ b/.azuredevops/components/rocSOLVER.yml
@@ -74,12 +74,12 @@ parameters:
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
+      #- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
      - { os: almalinux8, packageManager: dnf, target: gfx942 }
      - { os: almalinux8, packageManager: dnf, target: gfx90a }
      - { os: almalinux8, packageManager: dnf, target: gfx1201 }
      - { os: almalinux8, packageManager: dnf, target: gfx1100 }
-      - { os: almalinux8, packageManager: dnf, target: gfx1030 }
+      #- { os: almalinux8, packageManager: dnf, target: gfx1030 }
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
--- a/.azuredevops/components/rocSPARSE.yml
+++ b/.azuredevops/components/rocSPARSE.yml
@@ -73,7 +73,7 @@ parameters:
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
      - { os: ubuntu2204, packageManager: apt, target: gfx1201 }
      - { os: ubuntu2204, packageManager: apt, target: gfx1100 }
-      - { os: ubuntu2204, packageManager: apt, target: gfx1030 }
+      #- { os: ubuntu2204, packageManager: apt, target: gfx1030 }
    testJobs:
      - { os: ubuntu2204, packageManager: apt, target: gfx942 }
      - { os: ubuntu2204, packageManager: apt, target: gfx90a }
--- a/.azuredevops/components/rocWMMA.yml
+++ b/.azuredevops/components/rocWMMA.yml
@@ -70,7 +70,7 @@ jobs:
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
-    pool: ${{ variables.HIGH_BUILD_POOL }}
+    pool: ${{ variables.MEDIUM_BUILD_POOL }}
    workspace:
      clean: all
    steps:
--- a/.azuredevops/components/rocm-libraries.yml
+++ b/.azuredevops/components/rocm-libraries.yml
@@ -36,8 +36,10 @@ parameters:
    - gfortran
    - git
    - libdrm-dev
+    - liblapack-dev
    - libmsgpack-dev
    - libnuma-dev
+    - libopenblas-dev
    - ninja-build
    - python3-pip
    - python3-venv
@@ -46,6 +48,8 @@ parameters:
  default:
    - joblib
    - "packaging>=22.0"
+    - pytest
+    - pytest-cmake
    - --upgrade
 - name: rocmDependencies
  type: object
@@ -98,12 +102,12 @@ jobs:
    workspace:
      clean: all
    steps:
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
      parameters:
        aptPackages: ${{ parameters.aptPackages }}
        pipModules: ${{ parameters.pipModules }}
        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-cmake-latest.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
@@ -134,12 +138,26 @@ jobs:
          rocm-libraries | ${{ job.os }} | ${{ job.target }} | $(DAY_STRING)
          rocm-libraries | ${{ job.os }} | ${{ job.target }}
          rocm-libraries | ${{ job.os }}
+    - task: Bash@3
+      displayName: Add paths for CMake and Python site-packages binaries
+      inputs:
+        targetType: inline
+        script: |
+          USER_BASE=$(python3 -m site --user-base)
+          echo "##vso[task.prependpath]$USER_BASE/bin"
+          echo "##vso[task.setvariable variable=PytestCmakePath]$USER_BASE/share/Pytest/cmake"
+        displayName: Set cmake configure paths
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
        extraBuildFlags: >-
-          -DROCM_LIBRARIES_SUPERBUILD=ON
-          -GNinja
+          -D CMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor;$(PytestCmakePath)
+          -D CMAKE_INCLUDE_PATH=$(Agent.BuildDirectory)/rocm/llvm/include
+          -D CMAKE_CXX_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
+          -D CMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
+          -D CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          -D CMAKE_C_COMPILER_LAUNCHER=ccache
+          -G Ninja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
        componentName: ${{ parameters.componentName }}
--- a/.azuredevops/components/rocprofiler-compute.yml
+++ b/.azuredevops/components/rocprofiler-compute.yml
@@ -1,10 +1,29 @@
 parameters:
+- name: componentName
+  type: string
+  default: rocprofiler-compute
 - name: checkoutRepo
  type: string
  default: 'self'
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -78,6 +97,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: rocprofiler_compute_build_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -94,15 +117,19 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        extraBuildFlags: >-
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
    # - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
@@ -111,78 +138,83 @@ jobs:
    #     pipModules: ${{ parameters.pipModules }}
    #     gpuTarget: ${{ job.target }}

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: rocprofiler_compute_test_${{ job.target }}
-    timeoutInMinutes: 120
-    dependsOn: rocprofiler_compute_build_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    - name: PYTHON_VERSION
-      value: 3.10
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
-      parameters:
-        checkoutRepo: ${{ parameters.checkoutRepo }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-    - task: Bash@3
-      displayName: Add en_US.UTF-8 locale
-      inputs:
-        targetType: inline
-        script: |
-          sudo locale-gen en_US.UTF-8
-          sudo update-locale
-          locale -a
-    - task: Bash@3
-      displayName: Add ROCm binaries to PATH
-      inputs:
-        targetType: inline
-        script: |
-          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
-          echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
-      parameters:
-        extraBuildFlags: >-
-          -DCMAKE_HIP_ARCHITECTURES=${{ job.target }}
-          -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
-          -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
-          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
-          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
-          -DCMAKE_BUILD_TYPE=Release
-          -DENABLE_TESTS=ON
-          -DINSTALL_TESTS=ON
-          -GNinja
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocprofiler-compute
-        testDir: $(Build.BinariesDirectory)/libexec/rocprofiler-compute
-        testExecutable: ROCM_PATH=$(Agent.BuildDirectory)/rocm ctest
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: rocprofiler_compute_test_${{ job.target }}
+      timeoutInMinutes: 120
+      dependsOn: rocprofiler_compute_build_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      - name: PYTHON_VERSION
+        value: 3.10
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
+        parameters:
+          checkoutRepo: ${{ parameters.checkoutRepo }}
+          sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
+          gpuTarget: ${{ job.target }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - task: Bash@3
+        displayName: Add en_US.UTF-8 locale
+        inputs:
+          targetType: inline
+          script: |
+            sudo locale-gen en_US.UTF-8
+            sudo update-locale
+            locale -a
+      - task: Bash@3
+        displayName: Add ROCm binaries to PATH
+        inputs:
+          targetType: inline
+          script: |
+            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/bin"
+            echo "##vso[task.prependpath]$(Agent.BuildDirectory)/rocm/llvm/bin"
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+        parameters:
+          extraBuildFlags: >-
+            -DCMAKE_HIP_ARCHITECTURES=${{ job.target }}
+            -DCMAKE_C_COMPILER=$(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
+            -DCMAKE_MODULE_PATH=$(Agent.BuildDirectory)/rocm/lib/cmake/hip
+            -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/rocm
+            -DROCM_PATH=$(Agent.BuildDirectory)/rocm
+            -DCMAKE_BUILD_TYPE=Release
+            -DENABLE_TESTS=ON
+            -DINSTALL_TESTS=ON
+            -GNinja
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testDir: $(Build.BinariesDirectory)/libexec/rocprofiler-compute
+          testExecutable: ROCM_PATH=$(Agent.BuildDirectory)/rocm ctest
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/rocprofiler.yml
+++ b/.azuredevops/components/rocprofiler.yml
@@ -8,6 +8,22 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -70,6 +86,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -94,6 +114,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
      parameters:
        dependencyList:
@@ -108,6 +129,8 @@ jobs:
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
        os: ${{ job.os }}
@@ -115,6 +138,7 @@ jobs:
        extraBuildFlags: >-
          -DCMAKE_MODULE_PATH=$(Build.SourcesDirectory)/cmake_modules;$(Agent.BuildDirectory)/rocm/lib/cmake;$(Agent.BuildDirectory)/rocm/lib/cmake/hip;$(Agent.BuildDirectory)/rocm/lib64/cmake;$(Agent.BuildDirectory)/rocm/lib64/cmake/hip
          -DCMAKE_PREFIX_PATH="$(Agent.BuildDirectory)/rocm;$(Agent.BuildDirectory)/vendor"
+          -DROCM_PATH=$(Agent.BuildDirectory)/rocm
          -DCMAKE_POSITION_INDEPENDENT_CODE=ON
          -DENABLE_LDCONFIG=OFF
          -DUSE_PROF_API=1
@@ -122,10 +146,13 @@ jobs:
        multithreadFlag: -- -j32
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        gpuTarget: ${{ job.target }}
        os: ${{ job.os }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
@@ -139,63 +166,68 @@ jobs:
            - HIP_ROCCLR_HOME:::/home/user/workspace/rocm
            - ROCM_PATH:::/home/user/workspace/rocm

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    - name: ROCM_PATH
-      value: $(Agent.BuildDirectory)/rocm
-    - name: LD_LIBRARY_PATH
-      value: $(Agent.BuildDirectory)/rocm/lib/rocprofiler:$(Agent.BuildDirectory)/rocm/share/rocprofiler/tests-v1/test:$(Agent.BuildDirectory)/rocm/share/rocprofiler/tests
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        packageManager: ${{ job.packageManager }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-      parameters:
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmDependencies }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocprofilerV1
-        testDir: $(Agent.BuildDirectory)/rocm/share/rocprofiler/tests-v1
-        testExecutable:  ./run.sh
-        testParameters: ''
-        testPublishResults: false
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: rocprofilerV2
-        testDir: $(Agent.BuildDirectory)/rocm
-        testExecutable:  share/rocprofiler/tests/runUnitTests
-        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        environment: test
-        gpuTarget: ${{ job.target }}
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      - name: ROCM_PATH
+        value: $(Agent.BuildDirectory)/rocm
+      - name: LD_LIBRARY_PATH
+        value: $(Agent.BuildDirectory)/rocm/lib/rocprofiler:$(Agent.BuildDirectory)/rocm/share/rocprofiler/tests-v1/test:$(Agent.BuildDirectory)/rocm/share/rocprofiler/tests
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - checkout: none
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          packageManager: ${{ job.packageManager }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
+          gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+        parameters:
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmDependencies }}
+          gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: rocprofilerV1
+          testDir: $(Agent.BuildDirectory)/rocm/share/rocprofiler/tests-v1
+          testExecutable:  ./run.sh
+          testParameters: ''
+          testPublishResults: false
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: rocprofilerV2
+          testDir: $(Agent.BuildDirectory)/rocm
+          testExecutable:  share/rocprofiler/tests/runUnitTests
+          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}
--- a/.azuredevops/components/roctracer.yml
+++ b/.azuredevops/components/roctracer.yml
@@ -8,6 +8,22 @@ parameters:
 - name: checkoutRef
  type: string
  default: ''
+# monorepo related parameters
+- name: sparseCheckoutDir
+  type: string
+  default: ''
+- name: triggerDownstreamJobs
+  type: boolean
+  default: false
+- name: downstreamAggregateNames
+  type: string
+  default: ''
+- name: buildDependsOn
+  type: object
+  default: null
+- name: unifiedBuild
+  type: boolean
+  default: false
 # set to true if doing full build of ROCm stack
 # and dependencies are pulled from same pipeline
 - name: aggregatePipeline
@@ -65,6 +81,10 @@ parameters:
 jobs:
 - ${{ each job in parameters.jobMatrix.buildJobs }}:
  - job: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+    ${{ if parameters.buildDependsOn }}:
+      dependsOn:
+        - ${{ each build in parameters.buildDependsOn }}:
+          - ${{ build }}_${{ job.os }}_${{ job.target }}
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -87,6 +107,7 @@ jobs:
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml
      parameters:
        checkoutRepo: ${{ parameters.checkoutRepo }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
      parameters:
        checkoutRef: ${{ parameters.checkoutRef }}
@@ -94,6 +115,8 @@ jobs:
        gpuTarget: ${{ job.target }}
        aggregatePipeline: ${{ parameters.aggregatePipeline }}
        os: ${{ job.os }}
+        ${{ if parameters.triggerDownstreamJobs }}:
+          downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
    # the linker flags will not affect ubuntu2204 builds as the paths do not exist
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
      parameters:
@@ -109,10 +132,13 @@ jobs:
          -GNinja
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/manifest.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
+        sparseCheckoutDir: ${{ parameters.sparseCheckoutDir }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
      parameters:
+        componentName: ${{ parameters.componentName }}
        os: ${{ job.os }}
        gpuTarget: ${{ job.target }}
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-links.yml
@@ -123,53 +149,57 @@ jobs:
    #     gpuTarget: ${{ job.target }}
    #     registerROCmPackages: true

- ${{ each job in parameters.jobMatrix.testJobs }}:
-  - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
-    dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
-    condition:
-      and(succeeded(),
-        eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
-        not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), variables['Build.DefinitionName'])),
-        eq(${{ parameters.aggregatePipeline }}, False)
-      )
-    variables:
-    - group: common
-    - template: /.azuredevops/variables-global.yml
-    pool: ${{ job.target }}_test_pool
-    workspace:
-      clean: all
-    steps:
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        packageManager: ${{ job.packageManager }}
-        registerROCmPackages: true
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
-      parameters:
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
-      parameters:
-        checkoutRef: ${{ parameters.checkoutRef }}
-        dependencyList: ${{ parameters.rocmTestDependencies }}
-        gpuTarget: ${{ job.target }}
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
-      parameters:
-        componentName: roctracer
-        testExecutable: $(Agent.BuildDirectory)/rocm/share/roctracer/run_tests.sh
-        testParameters: ''
-        testDir: $(Agent.BuildDirectory)
-        testPublishResults: false
-        os: ${{ job.os }}
-    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
-      parameters:
-        aptPackages: ${{ parameters.aptPackages }}
-        pipModules: ${{ parameters.pipModules }}
-        environment: test
-        gpuTarget: ${{ job.target }}
-        registerROCmPackages: true
+- ${{ if eq(parameters.unifiedBuild, False) }}:
+  - ${{ each job in parameters.jobMatrix.testJobs }}:
+    - job: ${{ parameters.componentName }}_test_${{ job.os }}_${{ job.target }}
+      dependsOn: ${{ parameters.componentName }}_build_${{ job.os }}_${{ job.target }}
+      condition:
+        and(succeeded(),
+          eq(variables['ENABLE_${{ upper(job.target) }}_TESTS'], 'true'),
+          not(containsValue(split(variables['DISABLED_${{ upper(job.target) }}_TESTS'], ','), '${{ parameters.componentName }}')),
+          eq(${{ parameters.aggregatePipeline }}, False)
+        )
+      variables:
+      - group: common
+      - template: /.azuredevops/variables-global.yml
+      pool: ${{ job.target }}_test_pool
+      workspace:
+        clean: all
+      steps:
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          packageManager: ${{ job.packageManager }}
+          registerROCmPackages: true
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/local-artifact-download.yml
+        parameters:
+          preTargetFilter: ${{ parameters.componentName }}
+          gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-aqlprofile.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-rocm.yml
+        parameters:
+          checkoutRef: ${{ parameters.checkoutRef }}
+          dependencyList: ${{ parameters.rocmTestDependencies }}
+          gpuTarget: ${{ job.target }}
+          os: ${{ job.os }}
+          ${{ if parameters.triggerDownstreamJobs }}:
+            downstreamAggregateNames: ${{ parameters.downstreamAggregateNames }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/gpu-diagnostics.yml
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml
+        parameters:
+          componentName: ${{ parameters.componentName }}
+          testExecutable: $(Agent.BuildDirectory)/rocm/share/roctracer/run_tests.sh
+          testParameters: ''
+          testDir: $(Agent.BuildDirectory)
+          testPublishResults: false
+          os: ${{ job.os }}
+      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/docker-container.yml
+        parameters:
+          aptPackages: ${{ parameters.aptPackages }}
+          pipModules: ${{ parameters.pipModules }}
+          environment: test
+          gpuTarget: ${{ job.target }}
+          registerROCmPackages: true
--- a/.azuredevops/dependencies/catch2.yml
+++ b/.azuredevops/dependencies/catch2.yml
@@ -0,0 +1,63 @@
+parameters:
+- name: checkoutRepo
+  type: string
+  default: 'self'
+- name: checkoutRef
+  type: string
+  default: ''
+- name: catch2Version
+  type: string
+  default: ''
+- name: aptPackages
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+
+- name: jobMatrix
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
+
+jobs:
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: catch2_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - task: Bash@3
+      displayName: Clone catch2 ${{ parameters.catch2Version }}
+      inputs:
+        targetType: inline
+        script: git clone https://github.com/catchorg/Catch2.git -b ${{ parameters.catch2Version }}
+        workingDirectory: $(Agent.BuildDirectory)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/Catch2/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/Catch2
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_BUILD_TYPE=Release
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
--- a/.azuredevops/dependencies/fmtlib.yml
+++ b/.azuredevops/dependencies/fmtlib.yml
@@ -0,0 +1,67 @@
+parameters:
+- name: checkoutRepo
+  type: string
+  default: 'self'
+- name: checkoutRef
+  type: string
+  default: ''
+- name: fmtlibVersion
+  type: string
+  default: ''
+- name: aptPackages
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+    - libfmt-dev
+
+- name: jobMatrix
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
+
+jobs:
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: fmtlib_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - task: Bash@3
+      displayName: Clone fmtlib ${{ parameters.fmtlibVersion }}
+      inputs:
+        targetType: inline
+        script: git clone https://github.com/fmtlib/fmt.git -b ${{ parameters.fmtlibVersion }}
+        workingDirectory: $(Agent.BuildDirectory)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/fmt/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/fmt
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_BUILD_TYPE=Release
+          -DFMT_SYSTEM_HEADERS=ON
+          -DFMT_INSTALL=ON
+          -DFMT_TEST=OFF
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
--- a/.azuredevops/dependencies/libdivide.yml
+++ b/.azuredevops/dependencies/libdivide.yml
@@ -0,0 +1,64 @@
+parameters:
+- name: checkoutRepo
+  type: string
+  default: 'self'
+- name: checkoutRef
+  type: string
+  default: ''
+- name: libdivideVersion
+  type: string
+  default: ''
+- name: aptPackages
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+
+- name: jobMatrix
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
+
+jobs:
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: libdivide_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - task: Bash@3
+      displayName: Clone libdivide ${{ parameters.libdivideVersion }}
+      inputs:
+        targetType: inline
+        script: git clone https://github.com/ridiculousfish/libdivide.git -b ${{ parameters.libdivideVersion }}
+        workingDirectory: $(Agent.BuildDirectory)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/libdivide/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/libdivide
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_BUILD_TYPE=Release
+          -DLIBDIVIDE_BUILD_TESTS=OFF
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
--- a/.azuredevops/dependencies/spdlog.yml
+++ b/.azuredevops/dependencies/spdlog.yml
@@ -0,0 +1,71 @@
+parameters:
+- name: checkoutRepo
+  type: string
+  default: 'self'
+- name: checkoutRef
+  type: string
+  default: ''
+- name: spdlogVersion
+  type: string
+  default: ''
+- name: aptPackages
+  type: object
+  default:
+    - cmake
+    - git
+    - ninja-build
+
+- name: jobMatrix
+  type: object
+  default:
+    buildJobs:
+      - { os: ubuntu2204, packageManager: apt}
+      - { os: almalinux8, packageManager: dnf}
+
+jobs:
+- ${{ each job in parameters.jobMatrix.buildJobs }}:
+  - job: spdlog_${{ job.os }}
+    variables:
+    - group: common
+    - template: /.azuredevops/variables-global.yml
+    pool:
+      vmImage: 'ubuntu-22.04'
+    ${{ if eq(job.os, 'almalinux8') }}:
+      container:
+        image: rocmexternalcicd.azurecr.io/manylinux228:latest
+        endpoint: ContainerService3
+    workspace:
+      clean: all
+    steps:
+    - checkout: none
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-other.yml
+      parameters:
+        aptPackages: ${{ parameters.aptPackages }}
+        packageManager: ${{ job.packageManager }}
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/dependencies-vendor.yml
+      parameters:
+        dependencyList:
+          - fmtlib
+    - task: Bash@3
+      displayName: Clone spdlog ${{ parameters.spdlogVersion }}
+      inputs:
+        targetType: inline
+        script: git clone https://github.com/gabime/spdlog.git -b ${{ parameters.spdlogVersion }}
+        workingDirectory: $(Agent.BuildDirectory)
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml
+      parameters:
+        os: ${{ job.os }}
+        cmakeBuildDir: $(Agent.BuildDirectory)/spdlog/build
+        cmakeSourceDir: $(Agent.BuildDirectory)/spdlog
+        useAmdclang: false
+        extraBuildFlags: >-
+          -DCMAKE_PREFIX_PATH=$(Agent.BuildDirectory)/vendor
+          -DCMAKE_BUILD_TYPE=Release
+          -DSPDLOG_USE_STD_FORMAT=OFF
+          -DSPDLOG_FMT_EXTERNAL_HO=ON
+          -DSPDLOG_INSTALL=ON
+          -GNinja
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/artifact-upload.yml
+      parameters:
+        os: ${{ job.os }}
--- a/.azuredevops/nightly/pytorch.yml
+++ b/.azuredevops/nightly/pytorch.yml
@@ -397,6 +397,7 @@ jobs:
  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/preamble.yml
  - task: DownloadPipelineArtifact@2
    displayName: 'Download Pipeline Wheel Files'
+    retryCountOnTaskFailure: 3
    inputs:
      itemPattern: '**/*.whl'
      targetPath: $(Agent.BuildDirectory)
--- a/.azuredevops/nightly/rocm-nightly.yml
+++ b/.azuredevops/nightly/rocm-nightly.yml
@@ -93,7 +93,7 @@ schedules:
 jobs:
 - ${{ each job in parameters.jobList }}:
  - job: nightly_${{ job.os }}_${{ job.target }}
-    timeoutInMinutes: 90
+    timeoutInMinutes: 120
    variables:
    - group: common
    - template: /.azuredevops/variables-global.yml
@@ -226,6 +226,7 @@ jobs:
            cat Dockerfile
    - task: Docker@2
      displayName: Build and upload Docker image
+      retryCountOnTaskFailure: 3
      inputs:
        containerRegistry: ContainerService3
        repository: 'nightly-${{ job.os }}-${{ job.target }}'
--- a/.azuredevops/tag-builds/catch2.yml
+++ b/.azuredevops/tag-builds/catch2.yml
@@ -0,0 +1,23 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: catch2Version
+  type: string
+  default: "v3.7.0"
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none
+pr: none
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/catch2.yml
+    parameters:
+      catch2Version: ${{ parameters.catch2Version }}
--- a/.azuredevops/tag-builds/fmtlib.yml
+++ b/.azuredevops/tag-builds/fmtlib.yml
@@ -0,0 +1,23 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: fmtlibVersion
+  type: string
+  default: "11.1.3"
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none
+pr: none
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/fmtlib.yml
+    parameters:
+      fmtlibVersion: ${{ parameters.fmtlibVersion }}
--- a/.azuredevops/tag-builds/libdivide.yml
+++ b/.azuredevops/tag-builds/libdivide.yml
@@ -0,0 +1,23 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: libdivideVersion
+  type: string
+  default: master
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none
+pr: none
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/libdivide.yml
+    parameters:
+      libdivideVersion: ${{ parameters.libdivideVersion }}
--- a/.azuredevops/tag-builds/spdlog.yml
+++ b/.azuredevops/tag-builds/spdlog.yml
@@ -0,0 +1,23 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml
+
+parameters:
+- name: spdlogVersion
+  type: string
+  default: "v1.15.1"
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+trigger: none
+pr: none
+
+jobs:
+  - template: ${{ variables.CI_DEPENDENCIES_PATH }}/spdlog.yml
+    parameters:
+      spdlogVersion: ${{ parameters.spdlogVersion }}
--- a/.azuredevops/templates/steps/artifact-download.yml
+++ b/.azuredevops/templates/steps/artifact-download.yml
@@ -24,8 +24,12 @@ parameters:
 steps:
 - task: DownloadPipelineArtifact@2
  displayName: Download ${{ parameters.componentName }}
+  retryCountOnTaskFailure: 3
  inputs:
-    itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
+    ${{ if eq(parameters.componentName, 'clr') }}:
+      itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*amd*' # filter out nvidia clr artifacts
+    ${{ else }}:
+      itemPattern: '**/*${{ parameters.componentName }}*${{ parameters.fileFilter }}*'
    targetPath: '$(Pipeline.Workspace)/d'
    allowPartiallySucceededBuilds: true
    ${{ if parameters.aggregatePipeline }}:
--- a/.azuredevops/templates/steps/checkout.yml
+++ b/.azuredevops/templates/steps/checkout.yml
@@ -20,7 +20,7 @@ steps:
    retryCountOnTaskFailure: 3
    fetchFilter: blob:none
    ${{ if ne(parameters.sparseCheckoutDir, '') }}:
-      sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }}
+      sparseCheckoutDirectories: ${{ parameters.sparseCheckoutDir }} shared
      path: sparse
  - ${{ if ne(parameters.sparseCheckoutDir, '') }}:
    - task: Bash@3
--- a/.azuredevops/templates/steps/dependencies-apt.yml
+++ b/.azuredevops/templates/steps/dependencies-apt.yml
@@ -10,6 +10,7 @@ steps:
 - ${{ if eq(parameters.registerROCmPackages, true) }}:
  - task: Bash@3
    displayName: 'Register AMDGPU & ROCm repos (apt)'
+    retryCountOnTaskFailure: 3
    inputs:
      targetType: inline
      script: |
@@ -20,7 +21,8 @@ steps:
        echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
        sudo apt update
 - task: Bash@3
-  displayName: 'sudo apt-get update'
+  displayName: 'APT update and install packages'
+  retryCountOnTaskFailure: 3
  inputs:
    targetType: inline
    script: |
@@ -28,15 +30,6 @@ steps:
      echo "deb http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
      echo "deb http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
      echo "deb http://archive.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse" | sudo tee -a /etc/apt/sources.list.d/default.list
-      sudo DEBIAN_FRONTEND=noninteractive apt-get --yes update
- task: Bash@3
-  displayName: 'sudo apt-get fix'
-  inputs:
-    targetType: inline
-    script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-broken install
- ${{ if gt(length(parameters.aptPackages), 0) }}:
-  - task: Bash@3
-    displayName: 'sudo apt-get install ...'
-    inputs:
-      targetType: inline
-      script: sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install ${{ join(' ', parameters.aptPackages) }}
+      sudo DEBIAN_FRONTEND=noninteractive apt-get --yes update && \
+        sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-broken install && \
+        sudo DEBIAN_FRONTEND=noninteractive apt-get --yes --fix-missing install ${{ join(' ', parameters.aptPackages) }}
--- a/.azuredevops/templates/steps/dependencies-aqlprofile.yml
+++ b/.azuredevops/templates/steps/dependencies-aqlprofile.yml
@@ -5,51 +5,28 @@ parameters:

 steps:
 - task: Bash@3
-  displayName: Get aqlprofile package name
-  inputs:
-    targetType: inline
-    ${{ if eq(parameters.os, 'ubuntu2204') }}:
-      script: |
-        export packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb")
-        echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
-    ${{ if eq(parameters.os, 'almalinux8') }}:
-      script: |
-        export packageName=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1)
-        echo "##vso[task.setvariable variable=packageName;isreadonly=true]$packageName"
- task: Bash@3
-  displayName: 'Download aqlprofile'
-  inputs:
-    targetType: inline
-    workingDirectory: '$(Pipeline.Workspace)'
-    ${{ if eq(parameters.os, 'ubuntu2204') }}:
-      script: wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$(packageName)
-    ${{ if eq(parameters.os, 'almalinux8') }}:
-      script: wget -nv https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/$(packageName)
- task: Bash@3
-  displayName: 'Extract aqlprofile'
-  inputs:
-    targetType: inline
-    workingDirectory: '$(Pipeline.Workspace)'
-    ${{ if eq(parameters.os, 'ubuntu2204') }}:
-      script: |
-        mkdir hsa-amd-aqlprofile
-        dpkg-deb -R $(packageName) hsa-amd-aqlprofile
-    ${{ if eq(parameters.os, 'almalinux8') }}:
-      script: |
-        mkdir hsa-amd-aqlprofile
-        sudo dnf -y install rpm-build cpio
-        rpm2cpio $(packageName) | (cd hsa-amd-aqlprofile && cpio -idmv)
- task: Bash@3
-  displayName: 'Copy aqlprofile files'
+  displayName: Download and install aqlprofile
+  retryCountOnTaskFailure: 3
  inputs:
    targetType: inline
+    workingDirectory: $(Agent.BuildDirectory)
    script: |
-      mkdir -p $(Agent.BuildDirectory)/rocm
-      cp -R hsa-amd-aqlprofile/opt/rocm-*/* $(Agent.BuildDirectory)/rocm
-    workingDirectory: '$(Pipeline.Workspace)'
- task: Bash@3
-  displayName: 'Clean up aqlprofile'
-  inputs:
-    targetType: inline
-    script: rm -rf hsa-amd-aqlprofile $(packageName)
-    workingDirectory: '$(Pipeline.Workspace)'
+      set -e
+      if [ "${{ parameters.os }}" = "ubuntu2204" ]; then
+        packageName=$(curl -s https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/ | grep -oP "href=\"\K[^\"]*$(lsb_release -rs)[^\"]*\.deb") && \
+        wget -nv https://repo.radeon.com/rocm/apt/$(REPO_RADEON_VERSION)/pool/main/h/hsa-amd-aqlprofile/$packageName && \
+        mkdir -p hsa-amd-aqlprofile && \
+        dpkg-deb -R $packageName hsa-amd-aqlprofile
+      elif [ "${{ parameters.os }}" = "almalinux8" ]; then
+        sudo dnf -y install rpm-build cpio && \
+        packageName=$(curl -s https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/ | grep -oP "hsa-amd-aqlprofile-[^\"]+\.rpm" | head -n1) && \
+        wget -nv https://repo.radeon.com/rocm/rhel8/$(REPO_RADEON_VERSION)/main/$packageName && \
+        mkdir -p hsa-amd-aqlprofile && \
+        rpm2cpio $packageName | (cd hsa-amd-aqlprofile && cpio -idmv)
+      else
+        echo "Unsupported OS: ${{ parameters.os }}"
+        exit 1
+      fi && \
+      mkdir -p $(Agent.BuildDirectory)/rocm && \
+      cp -R hsa-amd-aqlprofile/opt/rocm-*/* $(Agent.BuildDirectory)/rocm && \
+      rm -rf hsa-amd-aqlprofile $packageName
--- a/.azuredevops/templates/steps/dependencies-dnf.yml
+++ b/.azuredevops/templates/steps/dependencies-dnf.yml
@@ -54,11 +54,13 @@ parameters:
    libfftw3-dev: fftw-devel
    libfmt-dev: fmt-devel
    libgmp-dev: gmp-devel
+    liblapack-dev: lapack-devel
    liblzma-dev: xz-devel
    libmpfr-dev: mpfr-devel
    libmsgpack-dev: msgpack-devel
    libncurses5-dev: ncurses-devel
    libnuma-dev: numactl-devel
+    libopenblas-dev: openblas-devel
    libopenmpi-dev: openmpi-devel
    libpci-dev: libpciaccess-devel
    libssl-dev: openssl-devel
@@ -87,6 +89,7 @@ steps:
 - ${{ if eq(parameters.registerROCmPackages, true) }}:
  - task: Bash@3
    displayName: 'Register AMDGPU & ROCm repos (dnf)'
+    retryCountOnTaskFailure: 3
    inputs:
      targetType: inline
      script: |
@@ -107,12 +110,13 @@ steps:
        sudo dnf makecache
 - task: Bash@3
  displayName: 'Install base dnf packages'
+  retryCountOnTaskFailure: 3
  inputs:
    targetType: inline
    script: |
-      sudo dnf config-manager --set-enabled powertools
      # rpm fusion free repo for some dependencies
-      sudo dnf -y install https://download1.rpmfusion.org/free/el/rpmfusion-free-release-8.noarch.rpm
+      sudo dnf config-manager --set-enabled powertools && \
+      sudo dnf -y install https://download1.rpmfusion.org/free/el/rpmfusion-free-release-8.noarch.rpm && \
      sudo dnf -y install ${{ join(' ', parameters.basePackages) }}
 - task: Bash@3
  displayName: 'Check gcc environment'
@@ -126,6 +130,7 @@ steps:
      g++ -print-file-name=libstdc++.so
 - task: Bash@3
  displayName: 'Set python 3.11 as default'
+  retryCountOnTaskFailure: 3
  inputs:
    targetType: inline
    script: |
@@ -140,18 +145,20 @@ steps:
  - ${{ if eq(pkg, 'ninja-build') }}:
    - task: Bash@3
      displayName: 'Install ninja 1.11.1'
+      retryCountOnTaskFailure: 3
      inputs:
        targetType: inline
        script: |
-          curl -LO https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip
-          sudo dnf -y install unzip
-          unzip ninja-linux.zip
-          sudo mv ninja /usr/local/bin/ninja
-          sudo chmod +x /usr/local/bin/ninja
+          sudo dnf -y install unzip && \
+          curl -LO https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip && \
+          unzip ninja-linux.zip && \
+          sudo mv ninja /usr/local/bin/ninja && \
+          sudo chmod +x /usr/local/bin/ninja && \
          echo "##vso[task.prependpath]/usr/local/bin"
  - ${{ if ne(parameters.aptToDnfMap[pkg], '') }}:
    - task: Bash@3
      displayName: 'dnf install ${{ parameters.aptToDnfMap[pkg] }}'
+      retryCountOnTaskFailure: 3
      inputs:
        targetType: inline
        script: |
--- a/.azuredevops/templates/steps/dependencies-other.yml
+++ b/.azuredevops/templates/steps/dependencies-other.yml
@@ -27,6 +27,7 @@ steps:
 - ${{ if gt(length(parameters.pipModules), 0) }}:
  - task: Bash@3
    displayName: 'pip install  ...'
+    retryCountOnTaskFailure: 3
    inputs:
      targetType: inline
      script: python3 -m pip install -v --force-reinstall ${{ join(' ', parameters.pipModules) }}
--- a/.azuredevops/templates/steps/dependencies-rocm.yml
+++ b/.azuredevops/templates/steps/dependencies-rocm.yml
@@ -47,8 +47,8 @@ parameters:
      developBranch: aomp-dev
      hasGpuTarget: false
    clr:
-      pipelineId: 145
-      developBranch: amd-staging
+      pipelineId: 335
+      developBranch: develop
      hasGpuTarget: false
    composable_kernel:
      pipelineId: 86
@@ -59,8 +59,8 @@ parameters:
      developBranch: rocm
      hasGpuTarget: false
    HIP:
-      pipelineId: 93
-      developBranch: amd-staging
+      pipelineId: 335
+      developBranch: develop
      hasGpuTarget: false
    hip-tests:
      pipelineId: 233
@@ -203,8 +203,8 @@ parameters:
      developBranch: develop
      hasGpuTarget: true
    rocprofiler:
-      pipelineId: 143
-      developBranch: amd-staging
+      pipelineId: 329
+      developBranch: develop
      hasGpuTarget: true
    rocprofiler-compute:
      pipelineId: 257
--- a/.azuredevops/templates/steps/dependencies-vendor.yml
+++ b/.azuredevops/templates/steps/dependencies-vendor.yml
@@ -8,15 +8,20 @@ parameters:
  type: object
  default:
    boost: 250
+    catch2: 343
+    fmtlib: 341
    grpc: 72
    gtest: 73
    half560: 68
    lapack: 69
+    libdivide: 342
+    spdlog: 340

 steps:
 - ${{ each dependency in parameters.dependencyList }}:
  - task: DownloadPipelineArtifact@2
    displayName: Download ${{ dependency }}
+    retryCountOnTaskFailure: 3
    inputs:
      project: ROCm-CI
      buildType: specific
@@ -28,7 +33,7 @@ steps:
    inputs:
      archiveFilePatterns: '$(Pipeline.Workspace)/d/**/*.tar.gz'
      destinationFolder: $(Agent.BuildDirectory)/vendor
-      cleanDestinationFolder: true
+      cleanDestinationFolder: false
      overwriteExistingFiles: true
  - task: DeleteFiles@1
    displayName: Clean up ${{ dependency }}
--- a/.azuredevops/templates/steps/local-artifact-download.yml
+++ b/.azuredevops/templates/steps/local-artifact-download.yml
@@ -33,6 +33,7 @@ parameters:
 steps:
  - task: DownloadPipelineArtifact@2
    displayName: Download ${{ parameters.preTargetFilter}}*${{ parameters.os }}_${{ parameters.gpuTarget}}*${{ parameters.postTargetFilter}}
+    retryCountOnTaskFailure: 3
    inputs:
      ${{ if eq(parameters.buildType, 'specific') }}:
        buildType: specific
--- a/.azuredevops/templates/steps/miopen-get-ck-build.yml
+++ b/.azuredevops/templates/steps/miopen-get-ck-build.yml
@@ -7,6 +7,7 @@ steps:
 - task: Bash@3
  name: downloadCKBuild
  displayName: Download specific CK build
+  retryCountOnTaskFailure: 3
  env:
    CXX: $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang++
    CC: $(Agent.BuildDirectory)/rocm/llvm/bin/amdclang
@@ -69,20 +70,29 @@ steps:

      RETRIES=0
      MAX_RETRIES=5
-      until wget -nv $ARTIFACT_URL -O $(System.ArtifactsDirectory)/ck.zip; do
-        RETRIES=$((RETRIES+1))
-        if [[ $RETRIES -ge $MAX_RETRIES ]]; then
-          echo "Failed to download CK artifact after $MAX_RETRIES attempts."
-          exit 1
+      SUCCESS=false
+      while [ $RETRIES -lt $MAX_RETRIES ]; do
+        wget -nv $ARTIFACT_URL -O $(System.ArtifactsDirectory)/ck.zip && \
+          unzip $(System.ArtifactsDirectory)/ck.zip -d $(System.ArtifactsDirectory) && \
+          mkdir -p $(Agent.BuildDirectory)/rocm && \
+          tar -zxvf $(System.ArtifactsDirectory)/composable_kernel*/*.tar.gz -C $(Agent.BuildDirectory)/rocm && \
+          rm -r $(System.ArtifactsDirectory)/ck.zip $(System.ArtifactsDirectory)/composable_kernel*
+
+        if [ $? -eq 0 ]; then
+          SUCCESS=true
+          echo "Successfully downloaded CK."
+          break
+        else
+          RETRIES=$((RETRIES + 1))
+          echo "Failed to download CK on attempt $RETRIES/$MAX_RETRIES, retrying..."
+          sleep 1
        fi
-        echo "Download failed, retrying ($RETRIES/$MAX_RETRIES)..."
-        sleep 5
      done

-      unzip $(System.ArtifactsDirectory)/ck.zip -d $(System.ArtifactsDirectory)
-      mkdir -p $(Agent.BuildDirectory)/rocm
-      tar -zxvf $(System.ArtifactsDirectory)/composable_kernel*/*.tar.gz -C $(Agent.BuildDirectory)/rocm
-      rm -r $(System.ArtifactsDirectory)/ck.zip $(System.ArtifactsDirectory)/composable_kernel*
+      if [ "$SUCCESS" = false ]; then
+        echo "ERROR: failed to download CK after $MAX_RETRIES attempts."
+        exit 1
+      fi

      if [[ $EXIT_CODE -ne 0 ]]; then
        BUILD_COMMIT=$(curl -s $AZ_API/build/builds/$CK_BUILD_ID | jq '.sourceVersion' | tr -d '"')
--- a/.azuredevops/variables-global.yml
+++ b/.azuredevops/variables-global.yml
@@ -28,13 +28,13 @@ variables:
 - name: GFX90A_TEST_POOL
  value: gfx90a_test_pool
 - name: LATEST_RELEASE_VERSION
-  value: 6.4.2
+  value: 6.4.3
 - name: REPO_RADEON_VERSION
-  value: 6.4.2
+  value: 6.4.3
 - name: NEXT_RELEASE_VERSION
  value: 7.0.0
 - name: LATEST_RELEASE_TAG
-  value: rocm-6.4.2
+  value: rocm-6.4.3
 - name: DOCKER_SKIP_GFX
  value: gfx90a
 - name: COMPOSABLE_KERNEL_PIPELINE_ID
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -5,6 +5,7 @@ ACEs
 ACS
 AccVGPR
 AccVGPRs
+AITER
 ALU
 AllReduce
 AMD
@@ -115,6 +116,7 @@ Deprecations
 DevCap
 DirectX
 Dockerfile
+Dockerized
 Doxygen
 dropless
 ELMo
@@ -122,6 +124,7 @@ ENDPGM
 EPYC
 ESXi
 EoS
+fas
 FBGEMM
 FFT
 FFTs
@@ -194,6 +197,7 @@ HWE
 HWS
 Haswell
 Higgs
+href
 Hyperparameters
 Huggingface
 ICD
@@ -360,6 +364,7 @@ PowerEdge
 PowerShell
 Pretrained
 Pretraining
+Primus
 Profiler's
 PyPi
 Pytest
@@ -524,6 +529,7 @@ Xilinx
 Xnack
 Xteam
 YAML
+YAMLs
 YML
 YModel
 ZeRO
@@ -584,6 +590,7 @@ completers
 composable
 concretization
 config
+configs
 conformant
 constructible
 convolutional
@@ -794,7 +801,9 @@ preprocessing
 preprocessor
 prequantized
 prerequisites
+pretrain
 pretraining
+primus
 profiler
 profilers
 protobuf
@@ -909,6 +918,7 @@ toolchain
 toolchains
 toolset
 toolsets
+torchtitan
 torchvision
 tqdm
 tracebacks
--- a/default.xml
+++ b/default.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-    <default revision="refs/tags/rocm-6.4.2"
+    <default revision="refs/tags/rocm-6.4.3"
     remote="rocm-org"
     sync-c="true"
     sync-j="4" />
--- a/docs/about/license.md
+++ b/docs/about/license.md
@@ -29,6 +29,7 @@ additional licenses. Please review individual repositories for more information.
 | [AMD SMI](https://github.com/ROCm/amdsmi) | [MIT](https://github.com/ROCm/amdsmi/blob/amd-staging/LICENSE) |
 | [aomp](https://github.com/ROCm/aomp/) | [Apache 2.0](https://github.com/ROCm/aomp/blob/aomp-dev/LICENSE) |
 | [aomp-extras](https://github.com/ROCm/aomp-extras/) | [MIT](https://github.com/ROCm/aomp-extras/blob/aomp-dev/LICENSE) |
+| [AQLprofile] | [MIT](https://github.com/ROCm/aqlprofile/blob/amd-staging/LICENSE) |
 | [Code Object Manager (Comgr)](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/comgr) | [The University of Illinois/NCSA](https://github.com/ROCm/llvm-project/blob/amd-staging/amd/comgr/LICENSE.txt) |
 | [Composable Kernel](https://github.com/ROCm/composable_kernel) | [MIT](https://github.com/ROCm/composable_kernel/blob/develop/LICENSE) |
 | [half](https://github.com/ROCm/half/) | [MIT](https://github.com/ROCm/half/blob/rocm/LICENSE.txt) |
@@ -46,7 +47,6 @@ additional licenses. Please review individual repositories for more information.
 | [hipSPARSE](https://github.com/ROCm/hipSPARSE/) | [MIT](https://github.com/ROCm/hipSPARSE/blob/develop/LICENSE.md) |
 | [hipSPARSELt](https://github.com/ROCm/hipSPARSELt/) | [MIT](https://github.com/ROCm/hipSPARSELt/blob/develop/LICENSE.md) |
 | [hipTensor](https://github.com/ROCm/hipTensor) | [MIT](https://github.com/ROCm/hipTensor/blob/develop/LICENSE) |
-| hsa-amd-aqlprofile | [AMD Software EULA](https://www.amd.com/en/legal/eula/amd-software-eula.html) |
 | [llvm-project](https://github.com/ROCm/llvm-project/) | [Apache](https://github.com/ROCm/llvm-project/blob/amd-staging/LICENSE.TXT) |
 | [llvm-project/flang](https://github.com/ROCm/llvm-project/tree/amd-staging/flang) | [Apache 2.0](https://github.com/ROCm/llvm-project/blob/amd-staging/flang/LICENSE.TXT) |
 | [MIGraphX](https://github.com/ROCm/AMDMIGraphX/) | [MIT](https://github.com/ROCm/AMDMIGraphX/blob/develop/LICENSE) |
@@ -132,12 +132,10 @@ companies.
 ### Package licensing

 :::{attention}
-AQL Profiler and AOCC CPU optimization are both provided in binary form, each
-subject to the license agreement enclosed in the directory for the binary available
-in `/opt/rocm/share/doc/hsa-amd-aqlprofile/EULA`. By using, installing,
-copying or distributing AQL Profiler and/or AOCC CPU Optimizations, you agree to
+ROCprof Trace Decoder and AOCC CPU optimizations are provided in binary form, subject to the license agreement enclosed on [GitHub](https://github.com/ROCm/rocprof-trace-decoder/blob/amd-mainline/LICENSE) for ROCprof Trace Decoder, and [Developer Central](https://www.amd.com/en/developer/aocc.html) for AOCC. By using, installing,
+copying or distributing ROCprof Trace Decoder or AOCC CPU Optimizations, you agree to
 the terms and conditions of this license agreement. If you do not agree to the
-terms of this agreement, do not install, copy or use the AQL Profiler and/or the
+terms of this agreement, do not install, copy or use ROCprof Trace Decoder or the
 AOCC CPU Optimizations.
 :::

--- a/docs/compatibility/compatibility-matrix-historical-6.0.csv
+++ b/docs/compatibility/compatibility-matrix-historical-6.0.csv
@@ -31,9 +31,9 @@ ROCm Version,6.4.3,6.4.2,6.4.1,6.4.0,6.3.3,6.3.2,6.3.1,6.3.0,6.2.4,6.2.2,6.2.1,6
      :doc:`TensorFlow <../compatibility/ml-compatibility/tensorflow-compatibility>`,"2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.18.1, 2.17.1, 2.16.2","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.17.0, 2.16.2, 2.15.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.16.1, 2.15.1, 2.14.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.15.0, 2.14.0, 2.13.1","2.14.0, 2.13.1, 2.12.1","2.14.0, 2.13.1, 2.12.1"
      :doc:`JAX <../compatibility/ml-compatibility/jax-compatibility>`,0.4.35,0.4.35,0.4.35,0.4.35,0.4.31,0.4.31,0.4.31,0.4.31,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26,0.4.26
      :doc:`verl <../compatibility/ml-compatibility/verl-compatibility>` [#verl_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.3.0.post0,N/A,N/A,N/A,N/A,N/A
-      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`,N/A,N/A,N/A,N/A,85f95ae,85f95ae,85f95ae,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Stanford Megatron-LM <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>` [#stanford-megatron-lm_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,85f95ae,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
      :doc:`DGL <../compatibility/ml-compatibility/dgl-compatibility>` [#dgl_compat]_,N/A,N/A,N/A,2.4.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,
-      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>`,N/A,N/A,N/A,N/A,0.7.0,0.7.0,0.7.0,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
+      :doc:`Megablocks <../compatibility/ml-compatibility/megablocks-compatibility>` [#megablocks_compat]_,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0.7.0,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
      :doc:`Taichi <../compatibility/ml-compatibility/taichi-compatibility>` [#taichi_compat]_,N/A,N/A,N/A,N/A,N/A,1.8.0b1,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A
      `ONNX Runtime <https://onnxruntime.ai/docs/build/eps.html#amd-migraphx>`_,1.2,1.2,1.2,1.2,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.17.3,1.14.1,1.14.1
 ,,,,,,,,,,,,,,,,,,
--- a/docs/compatibility/compatibility-matrix.rst
+++ b/docs/compatibility/compatibility-matrix.rst
@@ -242,7 +242,9 @@ Expand for full historical view of:
   .. [#mi300_602-past-60] **For ROCm 6.0.2** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
   .. [#mi300_600-past-60] **For ROCm 6.0.0** - MI300A (gfx942) is supported on Ubuntu 22.04.3, RHEL 8.9, and SLES 15 SP5. MI300X (gfx942) is only supported on Ubuntu 22.04.3.
   .. [#verl_compat] verl is only supported on ROCm 6.2.0.
+   .. [#stanford-megatron-lm_compat] Stanford Megatron-LM is only supported on ROCm 6.3.0.
   .. [#dgl_compat] DGL is only supported on ROCm 6.4.0.
+   .. [#megablocks_compat] Megablocks is only supported on ROCm 6.3.0.
   .. [#taichi_compat] Taichi is only supported on ROCm 6.3.2.
   .. [#kfd_support-past-60] As of ROCm 6.4.0, forward and backward compatibility between the AMD Kernel-mode GPU Driver (KMD) and its user space software is provided up to a year apart. For earlier ROCm releases, the compatibility is provided for +/- 2 releases. The tested user space versions on this page were accurate as of the time of initial ROCm release. For the most up-to-date information, see the latest version of this information at `User and kernel-space support matrix <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html>`_.
   .. [#ROCT-rocr-past-60] Starting from ROCm 6.3.0, the ROCT Thunk Interface is included as part of the ROCr runtime package.
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -9,17 +9,21 @@ import shutil
 import sys
 from pathlib import Path

-shutil.copy2("../RELEASE.md", "./about/release-notes.md")
-shutil.copy2("../CHANGELOG.md", "./release/changelog.md")
+gh_release_path = os.path.join("..", "RELEASE.md")
+gh_changelog_path = os.path.join("..", "CHANGELOG.md")
+sphinx_release_path = os.path.join("about", "release-notes.md")
+sphinx_changelog_path = os.path.join("release", "changelog.md")
+shutil.copy2(gh_release_path, sphinx_release_path)
+shutil.copy2(gh_changelog_path, sphinx_changelog_path)

 # Mark the consolidated changelog as orphan to prevent Sphinx from warning about missing toctree entries
-with open("./release/changelog.md", "r+") as file:
+with open(sphinx_changelog_path, "r+", encoding="utf-8") as file:
    content = file.read()
    file.seek(0)
    file.write(":orphan:\n" + content)

 # Replace GitHub-style [!ADMONITION]s with Sphinx-compatible ```{admonition} blocks
-with open("./release/changelog.md", "r") as file:
+with open(sphinx_changelog_path, "r", encoding="utf-8") as file:
    lines = file.readlines()

    modified_lines = []
@@ -57,11 +61,14 @@ with open("./release/changelog.md", "r") as file:

    file.close()

-    with open("./release/changelog.md", 'w') as file:
+    with open(sphinx_changelog_path, "w", encoding="utf-8") as file:
        file.writelines(modified_lines)

-os.system("mkdir -p ../_readthedocs/html/downloads")
-os.system("cp compatibility/compatibility-matrix-historical-6.0.csv ../_readthedocs/html/downloads/compatibility-matrix-historical-6.0.csv")
+matrix_path = os.path.join("compatibility", "compatibility-matrix-historical-6.0.csv")
+rtd_path = os.path.join("..", "_readthedocs", "html", "downloads")
+if not os.path.exists(rtd_path):
+    os.makedirs(rtd_path)
+shutil.copy2(matrix_path, rtd_path)

 latex_engine = "xelatex"
 latex_elements = {
@@ -117,11 +124,15 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/primus-megatron", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/pytorch-training", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.3", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.4", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-history", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/training/benchmark-docker/previous-versions/jax-maxtext-v25.4", "os": ["linux"]},
@@ -147,6 +158,8 @@ article_pages = [
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.8.5-20250521", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250605", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.0.1-20250702", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702", "os": ["linux"]},
+    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference", "os": ["linux"]},
    {"file": "how-to/rocm-for-ai/inference/deploy-your-model", "os": ["linux"]},

--- a/docs/contribute/building.md
+++ b/docs/contribute/building.md
@@ -28,13 +28,31 @@ See the [Python requirements file](https://github.com/ROCm/ROCm/blob/develop/doc

 Use the Python Virtual Environment (`venv`) and run the following commands from the project root:

+::::{tab-set}
+:::{tab-item} Linux and WSL
+:sync: linux
+
 ```sh
 python3 -mvenv .venv

-.venv/bin/python     -m pip install -r docs/sphinx/requirements.txt
-.venv/bin/python     -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
+.venv/bin/python -m pip install -r docs/sphinx/requirements.txt
+.venv/bin/python -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
 ```

+:::
+:::{tab-item} Windows
+:sync: windows
+
+```powershell
+python -mvenv .venv
+
+.venv\Scripts\python.exe -m pip install -r docs/sphinx/requirements.txt
+.venv\Scripts\python.exe -m sphinx -T -E -b html -d _build/doctrees -D language=en docs _build/html
+```
+
+:::
+::::
+
 Navigate to `_build/html/index.html` and open this file in a web browser.

 ## Visual Studio Code
--- a/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
@@ -0,0 +1,163 @@
+vllm_benchmark:
+  unified_docker:
+    latest:
+      # TODO: update me
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
+      rocm_version: 6.4.1
+      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a
+      hipblaslt_version: 0.15
+  model_groups:
+    - group: Meta Llama
+      tag: llama
+      models:
+      - model: Llama 3.1 8B
+        mad_tag: pyt_vllm_llama-3.1-8b
+        model_repo: meta-llama/Llama-3.1-8B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-8B
+        precision: float16
+      - model: Llama 3.1 70B
+        mad_tag: pyt_vllm_llama-3.1-70b
+        model_repo: meta-llama/Llama-3.1-70B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+        precision: float16
+      - model: Llama 3.1 405B
+        mad_tag: pyt_vllm_llama-3.1-405b
+        model_repo: meta-llama/Llama-3.1-405B-Instruct
+        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+        precision: float16
+      - model: Llama 2 7B
+        mad_tag: pyt_vllm_llama-2-7b
+        model_repo: meta-llama/Llama-2-7b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+        precision: float16
+      - model: Llama 2 70B
+        mad_tag: pyt_vllm_llama-2-70b
+        model_repo: meta-llama/Llama-2-70b-chat-hf
+        url: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+        precision: float16
+      - model: Llama 3.1 8B FP8
+        mad_tag: pyt_vllm_llama-3.1-8b_fp8
+        model_repo: amd/Llama-3.1-8B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 70B FP8
+        mad_tag: pyt_vllm_llama-3.1-70b_fp8
+        model_repo: amd/Llama-3.1-70B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-70B-Instruct-FP8-KV
+        precision: float8
+      - model: Llama 3.1 405B FP8
+        mad_tag: pyt_vllm_llama-3.1-405b_fp8
+        model_repo: amd/Llama-3.1-405B-Instruct-FP8-KV
+        url: https://huggingface.co/amd/Llama-3.1-405B-Instruct-FP8-KV
+        precision: float8
+    - group: Mistral AI
+      tag: mistral
+      models:
+      - model: Mixtral MoE 8x7B
+        mad_tag: pyt_vllm_mixtral-8x7b
+        model_repo: mistralai/Mixtral-8x7B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+        precision: float16
+      - model: Mixtral MoE 8x22B
+        mad_tag: pyt_vllm_mixtral-8x22b
+        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
+        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
+        precision: float16
+      - model: Mistral 7B
+        mad_tag: pyt_vllm_mistral-7b
+        model_repo: mistralai/Mistral-7B-Instruct-v0.3
+        url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+        precision: float16
+      - model: Mixtral MoE 8x7B FP8
+        mad_tag: pyt_vllm_mixtral-8x7b_fp8
+        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mixtral MoE 8x22B FP8
+        mad_tag: pyt_vllm_mixtral-8x22b_fp8
+        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+        precision: float8
+      - model: Mistral 7B FP8
+        mad_tag: pyt_vllm_mistral-7b_fp8
+        model_repo: amd/Mistral-7B-v0.1-FP8-KV
+        url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
+        precision: float8
+    - group: Qwen
+      tag: qwen
+      models:
+      - model: Qwen2 7B
+        mad_tag: pyt_vllm_qwen2-7b
+        model_repo: Qwen/Qwen2-7B-Instruct
+        url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
+        precision: float16
+      - model: Qwen2 72B
+        mad_tag: pyt_vllm_qwen2-72b
+        model_repo: Qwen/Qwen2-72B-Instruct
+        url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
+        precision: float16
+      - model: QwQ-32B
+        mad_tag: pyt_vllm_qwq-32b
+        model_repo: Qwen/QwQ-32B
+        url: https://huggingface.co/Qwen/QwQ-32B
+        precision: float16
+        tunableop: true
+    - group: Databricks DBRX
+      tag: dbrx
+      models:
+      - model: DBRX Instruct
+        mad_tag: pyt_vllm_dbrx-instruct
+        model_repo: databricks/dbrx-instruct
+        url: https://huggingface.co/databricks/dbrx-instruct
+        precision: float16
+      - model: DBRX Instruct FP8
+        mad_tag: pyt_vllm_dbrx_fp8
+        model_repo: amd/dbrx-instruct-FP8-KV
+        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
+        precision: float8
+    - group: Google Gemma
+      tag: gemma
+      models:
+      - model: Gemma 2 27B
+        mad_tag: pyt_vllm_gemma-2-27b
+        model_repo: google/gemma-2-27b
+        url: https://huggingface.co/google/gemma-2-27b
+        precision: float16
+    - group: Cohere
+      tag: cohere
+      models:
+      - model: C4AI Command R+ 08-2024
+        mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
+        model_repo: CohereForAI/c4ai-command-r-plus-08-2024
+        url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
+        precision: float16
+      - model: C4AI Command R+ 08-2024 FP8
+        mad_tag: pyt_vllm_command-r-plus_fp8
+        model_repo: amd/c4ai-command-r-plus-FP8-KV
+        url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
+        precision: float8
+    - group: DeepSeek
+      tag: deepseek
+      models:
+      - model: DeepSeek MoE 16B
+        mad_tag: pyt_vllm_deepseek-moe-16b-chat
+        model_repo: deepseek-ai/deepseek-moe-16b-chat
+        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
+        precision: float16
+    - group: Microsoft Phi
+      tag: phi
+      models:
+      - model: Phi-4
+        mad_tag: pyt_vllm_phi-4
+        model_repo: microsoft/phi-4
+        url: https://huggingface.co/microsoft/phi-4
+    - group: TII Falcon
+      tag: falcon
+      models:
+      - model: Falcon 180B
+        mad_tag: pyt_vllm_falcon-180b
+        model_repo: tiiuae/falcon-180B
+        url: https://huggingface.co/tiiuae/falcon-180B
+        precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/pytorch-inference-benchmark-models.yaml
@@ -39,7 +39,7 @@ pytorch_inference_benchmark:
        model_repo: Wan-AI/Wan2.1-T2V-14B
        url: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
        precision: bfloat16
-    - group: Janus-Pro
+    - group: Janus Pro
      tag: janus-pro
      models:
      - model: Janus Pro 7B
@@ -47,3 +47,11 @@ pytorch_inference_benchmark:
        model_repo: deepseek-ai/Janus-Pro-7B
        url: https://huggingface.co/deepseek-ai/Janus-Pro-7B
        precision: bfloat16
+    - group: Hunyuan Video
+      tag: hunyuan
+      models:
+      - model: Hunyuan Video
+        mad_tag: pyt_hy_video
+        model_repo: tencent/HunyuanVideo
+        url: https://huggingface.co/tencent/HunyuanVideo
+        precision: float16
--- a/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml
@@ -2,11 +2,11 @@ vllm_benchmark:
  unified_docker:
    latest:
      # TODO: update me
-      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715
-      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea
+      pull_tag: rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812
+      docker_hub_url: https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa
      rocm_version: 6.4.1
-      vllm_version: 0.9.1 (0.9.2.dev364+gb432b7a28.rocm641)
-      pytorch_version: 2.7.0+gitf717b2a
+      vllm_version: 0.10.0 (0.10.1.dev395+g340ea86df.rocm641)
+      pytorch_version: 2.7.0+gitf717b2a (2.7.0+gitf717b2a)
      hipblaslt_version: 0.15
  model_groups:
    - group: Meta Llama
@@ -27,11 +27,6 @@ vllm_benchmark:
        model_repo: meta-llama/Llama-3.1-405B-Instruct
        url: https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
        precision: float16
-      - model: Llama 2 7B
-        mad_tag: pyt_vllm_llama-2-7b
-        model_repo: meta-llama/Llama-2-7b-chat-hf
-        url: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
-        precision: float16
      - model: Llama 2 70B
        mad_tag: pyt_vllm_llama-2-70b
        model_repo: meta-llama/Llama-2-70b-chat-hf
@@ -65,11 +60,6 @@ vllm_benchmark:
        model_repo: mistralai/Mixtral-8x22B-Instruct-v0.1
        url: https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1
        precision: float16
-      - model: Mistral 7B
-        mad_tag: pyt_vllm_mistral-7b
-        model_repo: mistralai/Mistral-7B-Instruct-v0.3
-        url: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
-        precision: float16
      - model: Mixtral MoE 8x7B FP8
        mad_tag: pyt_vllm_mixtral-8x7b_fp8
        model_repo: amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
@@ -80,72 +70,15 @@ vllm_benchmark:
        model_repo: amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
        url: https://huggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
        precision: float8
-      - model: Mistral 7B FP8
-        mad_tag: pyt_vllm_mistral-7b_fp8
-        model_repo: amd/Mistral-7B-v0.1-FP8-KV
-        url: https://huggingface.co/amd/Mistral-7B-v0.1-FP8-KV
-        precision: float8
    - group: Qwen
      tag: qwen
      models:
-      - model: Qwen2 7B
-        mad_tag: pyt_vllm_qwen2-7b
-        model_repo: Qwen/Qwen2-7B-Instruct
-        url: https://huggingface.co/Qwen/Qwen2-7B-Instruct
-        precision: float16
-      - model: Qwen2 72B
-        mad_tag: pyt_vllm_qwen2-72b
-        model_repo: Qwen/Qwen2-72B-Instruct
-        url: https://huggingface.co/Qwen/Qwen2-72B-Instruct
-        precision: float16
      - model: QwQ-32B
        mad_tag: pyt_vllm_qwq-32b
        model_repo: Qwen/QwQ-32B
        url: https://huggingface.co/Qwen/QwQ-32B
        precision: float16
        tunableop: true
-    - group: Databricks DBRX
-      tag: dbrx
-      models:
-      - model: DBRX Instruct
-        mad_tag: pyt_vllm_dbrx-instruct
-        model_repo: databricks/dbrx-instruct
-        url: https://huggingface.co/databricks/dbrx-instruct
-        precision: float16
-      - model: DBRX Instruct FP8
-        mad_tag: pyt_vllm_dbrx_fp8
-        model_repo: amd/dbrx-instruct-FP8-KV
-        url: https://huggingface.co/amd/dbrx-instruct-FP8-KV
-        precision: float8
-    - group: Google Gemma
-      tag: gemma
-      models:
-      - model: Gemma 2 27B
-        mad_tag: pyt_vllm_gemma-2-27b
-        model_repo: google/gemma-2-27b
-        url: https://huggingface.co/google/gemma-2-27b
-        precision: float16
-    - group: Cohere
-      tag: cohere
-      models:
-      - model: C4AI Command R+ 08-2024
-        mad_tag: pyt_vllm_c4ai-command-r-plus-08-2024
-        model_repo: CohereForAI/c4ai-command-r-plus-08-2024
-        url: https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024
-        precision: float16
-      - model: C4AI Command R+ 08-2024 FP8
-        mad_tag: pyt_vllm_command-r-plus_fp8
-        model_repo: amd/c4ai-command-r-plus-FP8-KV
-        url: https://huggingface.co/amd/c4ai-command-r-plus-FP8-KV
-        precision: float8
-    - group: DeepSeek
-      tag: deepseek
-      models:
-      - model: DeepSeek MoE 16B
-        mad_tag: pyt_vllm_deepseek-moe-16b-chat
-        model_repo: deepseek-ai/deepseek-moe-16b-chat
-        url: https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat
-        precision: float16
    - group: Microsoft Phi
      tag: phi
      models:
@@ -153,11 +86,3 @@ vllm_benchmark:
        mad_tag: pyt_vllm_phi-4
        model_repo: microsoft/phi-4
        url: https://huggingface.co/microsoft/phi-4
-    - group: TII Falcon
-      tag: falcon
-      models:
-      - model: Falcon 180B
-        mad_tag: pyt_vllm_falcon-180b
-        model_repo: tiiuae/falcon-180B
-        url: https://huggingface.co/tiiuae/falcon-180B
-        precision: float16
--- a/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml
@@ -1,26 +1,15 @@
 dockers:
-  - pull_tag: rocm/megatron-lm:v25.6_py312
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
    components:
-      ROCm: 6.4.1
-      PyTorch: 2.8.0a0+git7d205b2
-      Python: 3.12
-      Transformer Engine: 2.1.0.dev0+8c4a512
-      hipBLASLt: 393e413
-      Triton: 3.3.0
-      RCCL: 2.23.4.7a84c5d
-    doc_name: Ubuntu 24.04 + Python 3.12
-  - pull_tag: rocm/megatron-lm:v25.6_py310
-    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
-    components:
-      ROCm: 6.4.1
-      PyTorch: 2.8.0a0+git7d205b2
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
      Python: "3.10"
-      Transformer Engine: 2.1.0.dev0+8c4a512
-      hipBLASLt: 393e413
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
      Triton: 3.3.0
-      RCCL: 2.23.4.7a84c5d
-    doc_name: Ubuntu 22.04 + Python 3.10
+      RCCL: 2.22.3
 model_groups:
  - group: Meta Llama
    tag: llama
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/megatron-lm-v25.6-benchmark-models.yaml
@@ -0,0 +1,60 @@
+dockers:
+  - pull_tag: rocm/megatron-lm:v25.6_py312
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0
+    components:
+      ROCm: 6.4.1
+      PyTorch: 2.8.0a0+git7d205b2
+      Python: 3.12
+      Transformer Engine: 2.1.0.dev0+8c4a512
+      hipBLASLt: 393e413
+      Triton: 3.3.0
+      RCCL: 2.23.4.7a84c5d
+    doc_name: Ubuntu 24.04 + Python 3.12
+  - pull_tag: rocm/megatron-lm:v25.6_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6
+    components:
+      ROCm: 6.4.1
+      PyTorch: 2.8.0a0+git7d205b2
+      Python: "3.10"
+      Transformer Engine: 2.1.0.dev0+8c4a512
+      hipBLASLt: 393e413
+      Triton: 3.3.0
+      RCCL: 2.23.4.7a84c5d
+    doc_name: Ubuntu 22.04 + Python 3.10
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.3-70b
+      - model: Llama 3.1 8B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-8b
+      - model: Llama 3.1 70B
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b
+      - model: Llama 3.1 70B (proxy)
+        mad_tag: pyt_megatron_lm_train_llama-3.1-70b-proxy
+      - model: Llama 2 7B
+        mad_tag: pyt_megatron_lm_train_llama-2-7b
+      - model: Llama 2 70B
+        mad_tag: pyt_megatron_lm_train_llama-2-70b
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
+      - model: DeepSeek-V2-Lite
+        mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: pyt_megatron_lm_train_mixtral-8x7b
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-7b
+      - model: Qwen 2.5 72B
+        mad_tag: pyt_megatron_lm_train_qwen2.5-72b
--- a/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
@@ -0,0 +1,120 @@
+unified_docker:
+  latest:
+    pull_tag: rocm/pytorch-training:v25.6
+    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
+    rocm_version: 6.4.1
+    pytorch_version: 2.8.0a0+git7d205b2
+    python_version: 3.10.17
+    transformer_engine_version: 1.14.0+2f85f5f2
+    flash_attention_version: 3.0.0.post1
+    hipblaslt_version: 0.15.0-8c6919d
+    triton_version: 3.3.0
+model_groups:
+  - group: Pre-training
+    tag: pre-training
+    models:
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [pretrain]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+      precision: BF16
+      training_modes: [pretrain]
+    - model: FLUX.1-dev
+      mad_tag: pyt_train_flux
+      model_repo: Flux
+      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+      precision: BF16
+      training_modes: [pretrain]
+  - group: Fine-tuning
+    tag: fine-tuning
+    models:
+    - model: Llama 4 Scout 17B-16E
+      mad_tag: pyt_train_llama-4-scout-17b-16e
+      model_repo: Llama-4-17B_16E
+      url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.3 70B
+      mad_tag: pyt_train_llama-3.3-70b
+      model_repo: Llama-3.3-70B
+      url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 3.2 1B
+      mad_tag: pyt_train_llama-3.2-1b
+      model_repo: Llama-3.2-1B
+      url: https://huggingface.co/meta-llama/Llama-3.2-1B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 3B
+      mad_tag: pyt_train_llama-3.2-3b
+      model_repo: Llama-3.2-3B
+      url: https://huggingface.co/meta-llama/Llama-3.2-3B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.2 Vision 11B
+      mad_tag: pyt_train_llama-3.2-vision-11b
+      model_repo: Llama-3.2-Vision-11B
+      url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.2 Vision 90B
+      mad_tag: pyt_train_llama-3.2-vision-90b
+      model_repo: Llama-3.2-Vision-90B
+      url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
+      precision: BF16
+      training_modes: [finetune_fw]
+    - model: Llama 3.1 8B
+      mad_tag: pyt_train_llama-3.1-8b
+      model_repo: Llama-3.1-8B
+      url: https://huggingface.co/meta-llama/Llama-3.1-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3.1 70B
+      mad_tag: pyt_train_llama-3.1-70b
+      model_repo: Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 3.1 405B
+      mad_tag: pyt_train_llama-3.1-405b
+      model_repo: Llama-3.1-405B
+      url: https://huggingface.co/meta-llama/Llama-3.1-405B
+      precision: BF16
+      training_modes: [finetune_qlora, HF_finetune_lora]
+    - model: Llama 3 8B
+      mad_tag: pyt_train_llama-3-8b
+      model_repo: Llama-3-8B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 3 70B
+      mad_tag: pyt_train_llama-3-70b
+      model_repo: Llama-3-70B
+      url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 7B
+      mad_tag: pyt_train_llama-2-7b
+      model_repo: Llama-2-7B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+    - model: Llama 2 13B
+      mad_tag: pyt_train_llama-2-13b
+      model_repo: Llama-2-13B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Llama 2 70B
+      mad_tag: pyt_train_llama-2-70b
+      model_repo: Llama-2-70B
+      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
+      precision: BF16
+      training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
--- a/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
@@ -0,0 +1,58 @@
+dockers:
+  - pull_tag: rocm/megatron-lm:v25.7_py310
+    docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a
+    components:
+      ROCm: 6.4.2
+      Primus: v0.1.0-rc1
+      PyTorch: 2.8.0a0+gitd06a406
+      Python: "3.10"
+      Transformer Engine: 2.1.0.dev0+ba586519
+      hipBLASLt: 37ba1d36
+      Triton: 3.3.0
+      RCCL: 2.22.3
+model_groups:
+  - group: Meta Llama
+    tag: llama
+    models:
+      - model: Llama 3.3 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
+        config_name: llama3.3_70B-pretrain.yaml
+      - model: Llama 3.1 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
+        config_name: llama3.1_70B-pretrain.yaml
+      - model: Llama 3.1 8B
+        mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
+        config_name: llama3.1_8B-pretrain.yaml
+      - model: Llama 2 7B
+        mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
+        config_name: llama2_7B-pretrain.yaml
+      - model: Llama 2 70B
+        mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
+        config_name: llama2_70B-pretrain.yaml
+  - group: DeepSeek
+    tag: deepseek
+    models:
+      - model: DeepSeek-V3 (proxy)
+        mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
+        config_name: deepseek_v3-pretrain.yaml
+      - model: DeepSeek-V2-Lite
+        mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
+        config_name: deepseek_v2_lite-pretrain.yaml
+  - group: Mistral AI
+    tag: mistral
+    models:
+      - model: Mixtral 8x7B
+        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
+        config_name: mixtral_8x7B_v0.1-pretrain.yaml
+      - model: Mixtral 8x22B (proxy)
+        mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
+        config_name: mixtral_8x22B_v0.1-pretrain.yaml
+  - group: Qwen
+    tag: qwen
+    models:
+      - model: Qwen 2.5 7B
+        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
+        config_name: primus_qwen2.5_7B-pretrain.yaml
+      - model: Qwen 2.5 72B
+        mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
+        config_name: qwen2.5_72B-pretrain.yaml
--- a/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
+++ b/docs/data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml
@@ -1,38 +1,17 @@
-unified_docker:
-  latest:
-    pull_tag: rocm/pytorch-training:v25.6
-    docker_hub_url: https://hub.docker.com/r/rocm/pytorch-training/tags
-    rocm_version: 6.4.1
-    pytorch_version: 2.8.0a0+git7d205b2
-    python_version: 3.10.17
-    transformer_engine_version: 1.14.0+2f85f5f2
-    flash_attention_version: 3.0.0.post1
-    hipblaslt_version: 0.15.0-8c6919d
-    triton_version: 3.3.0
+dockers:
+  - pull_tag: rocm/pytorch-training:v25.7
+    docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712
+    components:
+      ROCm: 6.4.2
+      PyTorch: 2.8.0a0+gitd06a406
+      Python: 3.10.18
+      Transformer Engine: 2.2.0.dev0+94e53dd8
+      Flash Attention: 3.0.0.post1
+      hipBLASLt: 1.1.0-4b9a52edfc
+      Triton: 3.3.0
 model_groups:
-  - group: Pre-training
-    tag: pre-training
-    models:
-    - model: Llama 3.1 8B
-      mad_tag: pyt_train_llama-3.1-8b
-      model_repo: Llama-3.1-8B
-      url: https://huggingface.co/meta-llama/Llama-3.1-8B
-      precision: BF16
-      training_modes: [pretrain]
-    - model: Llama 3.1 70B
-      mad_tag: pyt_train_llama-3.1-70b
-      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
-      precision: BF16
-      training_modes: [pretrain]
-    - model: FLUX.1-dev
-      mad_tag: pyt_train_flux
-      model_repo: Flux
-      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
-      precision: BF16
-      training_modes: [pretrain]
-  - group: Fine-tuning
-    tag: fine-tuning
+  - group: Meta Llama
+    tag: llama
    models:
    - model: Llama 4 Scout 17B-16E
      mad_tag: pyt_train_llama-4-scout-17b-16e
@@ -75,19 +54,19 @@ model_groups:
      model_repo: Llama-3.1-8B
      url: https://huggingface.co/meta-llama/Llama-3.1-8B
      precision: BF16
-      training_modes: [finetune_fw, finetune_lora]
+      training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
    - model: Llama 3.1 70B
      mad_tag: pyt_train_llama-3.1-70b
      model_repo: Llama-3.1-70B
-      url: https://huggingface.co/meta-llama/Llama-3.1-70B
+      url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
      precision: BF16
-      training_modes: [finetune_fw, finetune_lora, finetune_qlora]
+      training_modes: [pretrain, finetune_fw, finetune_lora]
    - model: Llama 3.1 405B
      mad_tag: pyt_train_llama-3.1-405b
      model_repo: Llama-3.1-405B
      url: https://huggingface.co/meta-llama/Llama-3.1-405B
      precision: BF16
-      training_modes: [finetune_qlora, HF_finetune_lora]
+      training_modes: [finetune_qlora]
    - model: Llama 3 8B
      mad_tag: pyt_train_llama-3-8b
      model_repo: Llama-3-8B
@@ -117,4 +96,67 @@ model_groups:
      model_repo: Llama-2-70B
      url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
      precision: BF16
-      training_modes: [finetune_lora, finetune_qlora, HF_finetune_lora]
+      training_modes: [finetune_lora, finetune_qlora]
+  - group: OpenAI
+    tag: openai
+    models:
+    - model: GPT OSS 20B
+      mad_tag: pyt_train_gpt_oss_20b
+      model_repo: GPT-OSS-20B
+      url: https://huggingface.co/openai/gpt-oss-20b
+      precision: BF16
+      training_modes: [HF_finetune_lora]
+    - model: GPT OSS 120B
+      mad_tag: pyt_train_gpt_oss_120b
+      model_repo: GPT-OSS-120B
+      url: https://huggingface.co/openai/gpt-oss-120b
+      precision: BF16
+      training_modes: [HF_finetune_lora]
+  - group: Qwen
+    tag: qwen
+    models:
+    - model: Qwen 3 8B
+      mad_tag: pyt_train_qwen3-8b
+      model_repo: Qwen3-8B
+      url: https://huggingface.co/Qwen/Qwen3-8B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Qwen 3 32B
+      mad_tag: pyt_train_qwen3-32b
+      model_repo: Qwen3-32
+      url: https://huggingface.co/Qwen/Qwen3-32B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2.5 32B
+      mad_tag: pyt_train_qwen2.5-32b
+      model_repo: Qwen2.5-32B
+      url: https://huggingface.co/Qwen/Qwen2.5-32B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2.5 72B
+      mad_tag: pyt_train_qwen2.5-72b
+      model_repo: Qwen2.5-72B
+      url: https://huggingface.co/Qwen/Qwen2.5-72B
+      precision: BF16
+      training_modes: [finetune_lora]
+    - model: Qwen 2 1.5B
+      mad_tag: pyt_train_qwen2-1.5b
+      model_repo: Qwen2-1.5B
+      url: https://huggingface.co/Qwen/Qwen2-1.5B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+    - model: Qwen 2 7B
+      mad_tag: pyt_train_qwen2-7b
+      model_repo: Qwen2-7B
+      url: https://huggingface.co/Qwen/Qwen2-7B
+      precision: BF16
+      training_modes: [finetune_fw, finetune_lora]
+  - group: Flux
+    tag: flux
+    models:
+    - model: FLUX.1-dev
+      mad_tag: pyt_train_flux
+      model_repo: Flux
+      url: https://huggingface.co/black-forest-labs/FLUX.1-dev
+      precision: BF16
+      training_modes: [pretrain]
--- a/docs/how-to/build-rocm.rst
+++ b/docs/how-to/build-rocm.rst
@@ -19,5 +19,6 @@ The general steps to build ROCm are:
 #. Run the build command

 Because the ROCm stack is constantly evolving, the most current instructions are stored with the source code in GitHub.  
-For detailed build instructions, see `Getting and Building ROCm from Source <https://github.com/ROCm/ROCm?tab=readme-ov-file#getting-and-building-rocm-from-source>`.
+For detailed build instructions, see `Getting and Building ROCm from Source <https://github.com/ROCm/ROCm?tab=readme-ov-file#getting-and-building-rocm-from-source>`_.
+

--- a/docs/how-to/deep-learning-rocm.rst
+++ b/docs/how-to/deep-learning-rocm.rst
@@ -2,58 +2,124 @@
   :description: How to install deep learning frameworks for ROCm
   :keywords: deep learning, frameworks, ROCm, install, PyTorch, TensorFlow, JAX, MAGMA, DeepSpeed, ML, AI

-********************************************
-Installing deep learning frameworks for ROCm
-********************************************
+**********************************
+Deep learning frameworks for ROCm
+**********************************

-ROCm provides a comprehensive ecosystem for deep learning development, including
-:ref:`libraries <artificial-intelligence-apis>` for optimized deep learning operations and ROCm-aware versions of popular
-deep learning frameworks and libraries such as PyTorch, TensorFlow, and JAX. ROCm works closely with these
-frameworks to ensure that framework-specific optimizations take advantage of AMD accelerator and GPU architectures.
+Deep learning frameworks provide environments for machine learning, training, fine-tuning, inference, and performance optimization.

-The following guides provide information on compatibility and supported
-features for these ROCm-enabled deep learning frameworks.
+ROCm offers a complete ecosystem for developing and running deep learning applications efficiently. It also provides ROCm-compatible versions of popular frameworks and libraries, such as PyTorch, TensorFlow, JAX, and others.

-* :doc:`PyTorch compatibility <../compatibility/ml-compatibility/pytorch-compatibility>`
-* :doc:`TensorFlow compatibility <../compatibility/ml-compatibility/tensorflow-compatibility>`
-* :doc:`JAX compatibility <../compatibility/ml-compatibility/jax-compatibility>`
-* :doc:`verl compatibility <../compatibility/ml-compatibility/verl-compatibility>`
-* :doc:`Stanford Megatron-LM compatibility <../compatibility/ml-compatibility/stanford-megatron-lm-compatibility>`
-* :doc:`DGL compatibility <../compatibility/ml-compatibility/dgl-compatibility>`
-* :doc:`Megablocks compatibility <../compatibility/ml-compatibility/megablocks-compatibility>`
-* :doc:`Taichi compatibility <../compatibility/ml-compatibility/taichi-compatibility>`
+The AMD ROCm organization actively contributes to open-source development and collaborates closely with framework organizations. This collaboration ensures that framework-specific optimizations effectively leverage AMD GPUs and accelerators.

-This chart steps through typical installation workflows for installing deep learning frameworks for ROCm.
+The table below summarizes information about ROCm-enabled deep learning frameworks. It includes details on ROCm compatibility and third-party tool support, installation steps and options, and links to GitHub resources. For a complete list of supported framework versions on ROCm, see the :doc:`Compatibility matrix <../compatibility/compatibility-matrix>` topic.

-.. image:: ../data/how-to/framework_install_2024_07_04.png
-   :alt: Flowchart for installing ROCm-aware machine learning frameworks
-   :align: center
+.. list-table:: 
+    :header-rows: 1
+    :widths: 5 3 6 3

-See the installation instructions to get started.
+    * - Framework
+      - Installation
+      - Installation options
+      - GitHub

-* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
-* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
-* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
-* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
-* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/stanford-megatron-lm-install>`
-* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/dgl-install>`
-* :doc:`Megablocks for ROCm <rocm-install-on-linux:install/3rd-party/megablocks-install>`
-* :doc:`Taichi for ROCm <rocm-install-on-linux:install/3rd-party/taichi-install>`
+    * - `PyTorch <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/pytorch-compatibility.html>`__
+      - .. raw:: html

-.. note::
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-docker-image-with-pytorch-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-a-wheels-package>`__
+        - `ROCm Base Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-rocm-base-docker-image>`__
+        - `Upstream Docker file <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#using-the-pytorch-upstream-dockerfile>`__
+      - .. raw:: html

-   For guidance on installing ROCm itself, refer to :doc:`ROCm installation for Linux <rocm-install-on-linux:index>`.
+          <a href="https://github.com/ROCm/pytorch"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `TensorFlow <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/tensorflow-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-docker-image-with-tensorflow-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/tensorflow-install.html#using-a-wheels-package>`__
+
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/tensorflow-upstream"><i class="fab fa-github fa-lg"></i></a> 
+
+    * - `JAX <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/jax-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/jax-install.html#using-a-prebuilt-docker-image>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/jax"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `verl <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/verl-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/verl-install.html#use-a-prebuilt-docker-image-with-verl-pre-installed>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/verl"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `Stanford Megatron-LM <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/stanford-megatron-lm-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/stanford-megatron-lm-install.html#use-a-prebuilt-docker-image-with-stanford-megatron-lm-pre-installed>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/Stanford-Megatron-LM"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `DGL <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/dgl-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/dgl-install.html#use-a-prebuilt-docker-image-with-dgl-pre-installed>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/dgl"><i class="fab fa-github fa-lg"></i></a> 
+
+    * - `Megablocks <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/megablocks-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/megablocks-install.html#using-a-prebuilt-docker-image-with-megablocks-pre-installed>`__
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/megablocks"><i class="fab fa-github fa-lg"></i></a>
+
+    * - `Taichi <https://rocm.docs.amd.com/en/latest/compatibility/ml-compatibility/taichi-compatibility.html>`__
+      - .. raw:: html
+
+          <a href="https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html"><i class="fas fa-link fa-lg"></i></a>
+      - 
+        - `Docker image <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-prebuilt-docker-image-with-taichi-pre-installed>`__
+        - `Wheels package <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/taichi-install.html#use-a-wheels-package>`__
+
+      - .. raw:: html
+
+          <a href="https://github.com/ROCm/taichi"><i class="fab fa-github fa-lg"></i></a>

 Learn how to use your ROCm deep learning environment for training, fine-tuning, inference, and performance optimization
 through the following guides.

 * :doc:`rocm-for-ai/index`

-* :doc:`Training <rocm-for-ai/training/index>`
+* :doc:`Use ROCm for training <rocm-for-ai/training/index>`

-* :doc:`Fine-tuning LLMs <rocm-for-ai/fine-tuning/index>`
+* :doc:`Use ROCm for fine-tuning LLMs <rocm-for-ai/fine-tuning/index>`

-* :doc:`Inference <rocm-for-ai/inference/index>`
+* :doc:`Use ROCm for AI inference <rocm-for-ai/inference/index>`

-* :doc:`Inference optimization <rocm-for-ai/inference-optimization/index>`
+* :doc:`Use ROCm for AI inference optimization <rocm-for-ai/inference-optimization/index>`

--- a/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
+++ b/docs/how-to/rocm-for-ai/inference-optimization/workload.rst
@@ -939,7 +939,7 @@ hipBLASLt benchmarking
 The GEMM library
 `hipBLASLt <https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/index.html>`_
 provides a benchmark tool for its supported operations. Refer to the
-`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/benchmarks/README.md>`_
+`documentation <https://github.com/ROCm/hipBLASLt/blob/develop/clients/bench/README.md>`_
 for details.

 * Example 1: Benchmark mix fp8 GEMM
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250702.rst
@@ -14,7 +14,7 @@ vLLM inference performance testing
   This documentation does not reflect the latest version of ROCm vLLM
   inference performance documentation. See :doc:`../vllm` for the latest version.

-.. _vllm-benchmark-unified-docker:
+.. _vllm-benchmark-unified-docker-702:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250702-benchmark-models.yaml

@@ -77,7 +77,7 @@ vLLM inference performance testing
        </div>
      </div>

-   .. _vllm-benchmark-vllm:
+   .. _vllm-benchmark-vllm-702:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
@@ -159,7 +159,7 @@ vLLM inference performance testing
   Once the setup is complete, choose between two options to reproduce the
   benchmark results:

-   .. _vllm-benchmark-mad:
+   .. _vllm-benchmark-mad-702:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-0.9.1-20250715.rst
@@ -0,0 +1,450 @@
+:orphan:
+
+.. meta::
+   :description: Learn how to validate LLM inference performance on MI300X accelerators using AMD MAD and the
+                 ROCm vLLM Docker image.
+   :keywords: model, MAD, automation, dashboarding, validate
+
+**********************************
+vLLM inference performance testing
+**********************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   inference performance documentation. See :doc:`../vllm` for the latest version.
+
+.. _vllm-benchmark-unified-docker-715:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   The `ROCm vLLM Docker <{{ unified_docker.docker_hub_url }}>`_ image offers
+   a prebuilt, optimized environment for validating large language model (LLM)
+   inference performance on AMD Instinct™ MI300X series accelerators. This ROCm vLLM
+   Docker image integrates vLLM and PyTorch tailored specifically for MI300X series
+   accelerators and includes the following components:
+
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+
+      * - `ROCm <https://github.com/ROCm/ROCm>`__
+        - {{ unified_docker.rocm_version }}
+
+      * - `vLLM <https://docs.vllm.ai/en/latest>`__
+        - {{ unified_docker.vllm_version }}
+
+      * - `PyTorch <https://github.com/ROCm/pytorch>`__
+        - {{ unified_docker.pytorch_version }}
+
+      * - `hipBLASLt <https://github.com/ROCm/hipBLASLt>`__
+        - {{ unified_docker.hipblaslt_version }}
+
+With this Docker image, you can quickly test the :ref:`expected
+inference performance numbers <vllm-benchmark-performance-measurements-715>` for
+MI300X series accelerators.
+
+What's new
+==========
+
+The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <vllm-history>`.
+
+* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default.
+  This parameter has been removed from the benchmarking script.
+
+* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
+  This parameter has been removed from the benchmarking script.
+
+* Fixed a ``+rms_norm`` custom kernel issue.
+
+* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``.
+
+* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
+
+Supported models
+================
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   .. _vllm-benchmark-available-models-715:
+
+   The following models are supported for inference performance benchmarking
+   with vLLM and ROCm. Some instructions, commands, and recommendations in this
+   documentation might vary by model -- select one to get started.
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+      <div class="row">
+         <div class="col-2 me-2 model-param-head">Model group</div>
+         <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+         </div>
+      </div>
+
+      <div class="row mt-1">
+         <div class="col-2 me-2 model-param-head">Model</div>
+         <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+         </div>
+      </div>
+      </div>
+
+   .. _vllm-benchmark-vllm-715:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. note::
+
+         See the `{{ model.model }} model card on Hugging Face <{{ model.url }}>`_ to learn more about your selected model.
+         Some models require access authorization prior to use via an external license agreement through a third party.
+
+      {% endfor %}
+   {% endfor %}
+
+.. note::
+
+   vLLM is a toolkit and library for LLM inference and serving. AMD implements
+   high-performance custom kernels and modules in vLLM to enhance performance.
+   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
+   more information.
+
+.. _vllm-benchmark-performance-measurements-715:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+page provides reference throughput and latency measurements for inferencing popular AI models.
+
+.. important::
+
+   The performance data presented in
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
+   only reflects the latest version of this inference benchmarking environment.
+   The listed measurements should not be interpreted as the peak performance achievable by AMD Instinct MI325X and MI300X accelerators or ROCm software.
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/previous-versions/vllm_0.9.1_20250715-benchmark_models.yaml
+
+   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
+   {% set model_groups = data.vllm_benchmark.model_groups %}
+
+   Pull the Docker image
+   =====================
+
+   Download the `ROCm vLLM Docker image <{{ unified_docker.docker_hub_url }}>`_.
+   Use the following command to pull the Docker image from Docker Hub.
+
+   .. code-block:: shell
+
+      docker pull {{ unified_docker.pull_tag }}
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to reproduce the
+   benchmark results:
+
+   .. _vllm-benchmark-mad-715:
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+   .. container:: model-doc {{model.mad_tag}}
+
+      .. tab-set::
+
+         .. tab-item:: MAD-integrated benchmarking
+
+            1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+               directory and install the required packages on the host machine.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD
+                  pip install -r requirements.txt
+
+            2. Use this command to run the performance benchmark test on the `{{model.model}} <{{ model.url }}>`_ model
+               using one GPU with the :literal:`{{model.precision}}` data type on the host machine.
+
+               .. code-block:: shell
+
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{model.mad_tag}} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
+
+            Although the :ref:`available models <vllm-benchmark-available-models-715>` are preconfigured
+            to collect latency and throughput performance data, you can also change the benchmarking
+            parameters. See the standalone benchmarking tab for more information.
+
+            {% if model.tunableop %}
+
+            .. note::
+
+               For improved performance, consider enabling :ref:`PyTorch TunableOp <mi300x-tunableop>`.
+               TunableOp automatically explores different implementations and configurations of certain PyTorch
+               operators to find the fastest one for your hardware.
+
+               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
+               (see
+               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
+               To enable it, include the ``--tunableop on`` argument in your
+               run.
+
+               Enabling TunableOp triggers a two-pass run -- a warm-up followed
+               by the performance-collection run.
+
+            {% endif %}
+
+         .. tab-item:: Standalone benchmarking
+
+            .. rubric:: Download the Docker image and required scripts
+
+            1. Run the vLLM benchmark tool independently by starting the
+               `Docker container <{{ unified_docker.docker_hub_url }}>`_
+               as shown in the following snippet.
+
+               .. code-block:: shell
+
+                  docker pull {{ unified_docker.pull_tag }}
+                  docker run -it \
+                      --device=/dev/kfd \
+                      --device=/dev/dri \
+                      --group-add video \
+                      --shm-size 16G \
+                      --security-opt seccomp=unconfined \
+                      --security-opt apparmor=unconfined \
+                      --cap-add=SYS_PTRACE \
+                      -v $(pwd):/workspace \
+                      --env HUGGINGFACE_HUB_CACHE=/workspace \
+                      --name test \
+                      {{ unified_docker.pull_tag }}
+
+            2. In the Docker container, clone the ROCm MAD repository and navigate to the
+               benchmark scripts directory at ``~/MAD/scripts/vllm``.
+
+               .. code-block:: shell
+
+                  git clone https://github.com/ROCm/MAD
+                  cd MAD/scripts/vllm
+
+            3. To start the benchmark, use the following command with the appropriate options.
+
+               .. dropdown:: Benchmark options
+                  :open:
+
+                  .. list-table::
+                     :header-rows: 1
+                     :align: center
+
+                     * - Name
+                       - Options
+                       - Description
+
+                     * - ``$test_option``
+                       - latency
+                       - Measure decoding token latency
+
+                     * -
+                       - throughput
+                       - Measure token generation throughput
+
+                     * -
+                       - all
+                       - Measure both throughput and latency
+
+                     * - ``$num_gpu``
+                       - 1 or 8
+                       - Number of GPUs
+
+                     * - ``$datatype``
+                       - ``float16`` or ``float8``
+                       - Data type
+
+                  The input sequence length, output sequence length, and tensor parallel (TP) are
+                  already configured. You don't need to specify them with this script.
+
+               Command:
+
+               .. code-block::
+
+                  ./vllm_benchmark_report.sh \
+                      -s $test_option \
+                      -m {{model.model_repo}} \
+                      -g $num_gpu \
+                      -d {{model.precision}}
+
+               .. note::
+
+                  For best performance, it's recommend to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
+
+                  If you encounter the following error, pass your access-authorized Hugging
+                  Face token to the gated models.
+
+                  .. code-block::
+
+                     OSError: You are trying to access a gated repo.
+
+                     # pass your HF_TOKEN
+                     export HF_TOKEN=$your_personal_hf_token
+
+            .. rubric:: Benchmarking examples
+
+            Here are some examples of running the benchmark with various options:
+
+            * Latency benchmark
+
+              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block::
+
+                 ./vllm_benchmark_report.sh \
+                     -s latency \
+                     -m {{model.model_repo}} \
+                     -g 8 \
+                     -d {{model.precision}}
+
+              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
+
+            * Throughput benchmark
+
+              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block:: shell
+
+                 ./vllm_benchmark_report.sh \
+                     -s throughput \
+                     -m {{model.model_repo}} \
+                     -g 8 \
+                     -d {{model.precision}}
+
+              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
+
+            .. raw:: html
+
+               <style>
+               mjx-container[jax="CHTML"][display="true"] {
+                  text-align: left;
+                  margin: 0;
+               }
+               </style>
+
+            .. note::
+
+               Throughput is calculated as:
+
+               - .. math:: throughput\_tot = requests \times (\mathsf{\text{input lengths}} + \mathsf{\text{output lengths}}) / elapsed\_time
+
+               - .. math:: throughput\_gen = requests \times \mathsf{\text{output lengths}} / elapsed\_time
+      {% endfor %}
+   {% endfor %}
+
+Advanced usage
+==============
+
+For information on experimental features and known issues related to ROCm optimization efforts on vLLM,
+see the developer's guide at `<https://github.com/ROCm/vllm/tree/f94ec9beeca1071cc34f9d1e206d8c7f3ac76129/docs/dev-docker>`__.
+
+Reproducing the Docker image
+----------------------------
+
+To reproduce this ROCm/vLLM Docker image release, follow these steps:
+
+1. Clone the `vLLM repository <https://github.com/ROCm/vllm>`__.
+
+   .. code-block:: shell
+
+      git clone https://github.com/ROCm/vllm.git
+
+2. Checkout the specific release commit.
+
+   .. code-block:: shell
+
+      cd vllm
+      git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
+
+3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.
+
+   .. code-block:: shell
+
+      docker build -f docker/Dockerfile.rocm -t vllm-rocm .
+
+Known issues and workarounds
+============================
+
+AITER does not support FP8 KV cache yet.
+
+Further reading
+===============
+
+- To learn more about the options for latency and throughput benchmark scripts,
+  see `<https://github.com/ROCm/vllm/tree/main/benchmarks>`_.
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For application performance optimization strategies for HPC and AI workloads,
+  including inference with vLLM, see :doc:`/how-to/rocm-for-ai/inference-optimization/workload`.
+
+- To learn how to run community models from Hugging Face on AMD GPUs, see
+  :doc:`Running models from Hugging Face </how-to/rocm-for-ai/inference/hugging-face-models>`.
+
+- To learn how to fine-tune LLMs and optimize inference, see
+  :doc:`Fine-tuning LLMs and inference optimization </how-to/rocm-for-ai/fine-tuning/fine-tuning-and-inference>`.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`vllm-history` to find documentation for previous releases
+of the ``ROCm/vllm`` Docker image.
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/previous-versions/vllm-history.rst
@@ -16,14 +16,23 @@ previous releases of the ``ROCm/vllm`` Docker image on `Docker Hub <https://hub.
     - Components
     - Resources

-   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
+   * - ``rocm/vllm:rocm6.4.1_vllm_0.10.0_20250812``
       (latest)
+     - 
+       * ROCm 6.4.1
+       * vLLM 0.10.0
+       * PyTorch 2.7.0
+     - 
+       * :doc:`Documentation <../vllm>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.10.0_20250812/images/sha256-4c277ad39af3a8c9feac9b30bf78d439c74d9b4728e788a419d3f1d0c30cacaa>`__
+
+   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715``
     - 
       * ROCm 6.4.1
       * vLLM 0.9.1
       * PyTorch 2.7.0
     - 
-       * :doc:`Documentation <../vllm>`
+       * :doc:`Documentation <vllm-0.9.1-20250715>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/vllm/rocm6.4.1_vllm_0.9.1_20250715/images/sha256-4a429705fa95a58f6d20aceab43b1b76fa769d57f32d5d28bd3f4e030e2a78ea>`__

   * - ``rocm/vllm:rocm6.4.1_vllm_0.9.1_20250702``
--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/pytorch-inference.rst
@@ -103,7 +103,7 @@ PyTorch inference performance testing

         The Chai-1 benchmark uses a specifically selected Docker image using ROCm 6.2.3 and PyTorch 2.3.0 to address an accuracy issue.

-   .. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference pyt_janus_pro_inference
+   .. container:: model-doc pyt_clip_inference pyt_mochi_video_inference pyt_wan2.1_inference pyt_janus_pro_inference pyt_hy_video

      Use the following command to pull the `ROCm PyTorch Docker image <https://hub.docker.com/layers/rocm/pytorch/latest/images/sha256-05b55983e5154f46e7441897d0908d79877370adca4d1fff4899d9539d6c4969>`__ from Docker Hub.

--- a/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
+++ b/docs/how-to/rocm-for-ai/inference/benchmark-docker/vllm.rst
@@ -7,7 +7,7 @@
 vLLM inference performance testing
 **********************************

-.. _vllm-benchmark-unified-docker:
+.. _vllm-benchmark-unified-docker-812:

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/inference/vllm-benchmark-models.yaml

@@ -39,7 +39,7 @@ vLLM inference performance testing
        - {{ unified_docker.hipblaslt_version }}

 With this Docker image, you can quickly test the :ref:`expected
-inference performance numbers <vllm-benchmark-performance-measurements>` for
+inference performance numbers <vllm-benchmark-performance-measurements-812>` for
 MI300X series accelerators.

 What's new
@@ -47,17 +47,11 @@ What's new

 The following is summary of notable changes since the :doc:`previous ROCm/vLLM Docker release <previous-versions/vllm-history>`.

-* The ``--compilation-config-parameter`` is no longer required as its options are now enabled by default.
-  This parameter has been removed from the benchmarking script.
+* Upgraded to vLLM v0.10.

-* Resolved Llama 3.1 405 B custom all-reduce issue, eliminating the need for ``--disable-custom-all-reduce``.
-  This parameter has been removed from the benchmarking script.
+* FP8 KV cache support via AITER.

-* Fixed a ``+rms_norm`` custom kernel issue.
-
-* Added quick reduce functionality. Set ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=FP`` to enable; supported modes are ``FP``, ``INT8``, ``INT6``, ``INT4``.
-
-* Implemented a workaround to potentially mitigate GPU crashes experienced with the Command R+ model, pending a driver fix.
+* Full graph capture support via AITER.

 Supported models
 ================
@@ -67,7 +61,7 @@ Supported models
   {% set unified_docker = data.vllm_benchmark.unified_docker.latest %}
   {% set model_groups = data.vllm_benchmark.model_groups %}

-   .. _vllm-benchmark-available-models:
+   .. _vllm-benchmark-available-models-812:

   The following models are supported for inference performance benchmarking
   with vLLM and ROCm. Some instructions, commands, and recommendations in this
@@ -102,7 +96,7 @@ Supported models
      </div>
      </div>

-   .. _vllm-benchmark-vllm:
+   .. _vllm-benchmark-vllm-812:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
@@ -124,14 +118,14 @@ Supported models
   See :ref:`fine-tuning-llms-vllm` and :ref:`mi300x-vllm-optimization` for
   more information.

-.. _vllm-benchmark-performance-measurements:
+.. _vllm-benchmark-performance-measurements-812:

 Performance measurements
 ========================

 To evaluate performance, the
 `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html>`_
-page provides reference throughput and latency measurements for inferencing popular AI models.
+page provides reference throughput and serving measurements for inferencing popular AI models.

 .. important::

@@ -176,7 +170,7 @@ system's configuration.
   Once the setup is complete, choose between two options to reproduce the
   benchmark results:

-   .. _vllm-benchmark-mad:
+   .. _vllm-benchmark-mad-812:

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
@@ -209,12 +203,15 @@ system's configuration.
                      --timeout 28800

            MAD launches a Docker container with the name
-            ``container_ci-{{model.mad_tag}}``. The latency and throughput reports of the
-            model are collected in the following path: ``~/MAD/reports_{{model.precision}}/``.
+            ``container_ci-{{model.mad_tag}}``. The throughput and serving reports of the
+            model are collected in the following paths: ``{{ model.mad_tag }}_throughput.csv``
+            and ``{{ model.mad_tag }}_serving.csv``.

-            Although the :ref:`available models <vllm-benchmark-available-models>` are preconfigured
-            to collect latency and throughput performance data, you can also change the benchmarking
-            parameters. See the standalone benchmarking tab for more information.
+            Although the :ref:`available models
+            <vllm-benchmark-available-models-812>` are preconfigured to collect
+            offline throughput and online serving performance data, you can
+            also change the benchmarking parameters. See the standalone
+            benchmarking tab for more information.

            {% if model.tunableop %}

@@ -224,14 +221,12 @@ system's configuration.
               TunableOp automatically explores different implementations and configurations of certain PyTorch
               operators to find the fastest one for your hardware.

-               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled
-               (see
-               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__).
-               To enable it, include the ``--tunableop on`` argument in your
-               run.
+               By default, ``{{model.mad_tag}}`` runs with TunableOp disabled (see
+               `<https://github.com/ROCm/MAD/blob/develop/models.json>`__). To enable it, include
+               the ``--tunableop on`` argument in your run.

-               Enabling TunableOp triggers a two-pass run -- a warm-up followed
-               by the performance-collection run.
+               Enabling TunableOp triggers a two-pass run -- a warm-up followed by the
+               performance-collection run.

            {% endif %}

@@ -269,6 +264,13 @@ system's configuration.

            3. To start the benchmark, use the following command with the appropriate options.

+               .. code-block::
+
+                  ./run.sh \
+                      --config $CONFIG_CSV \
+                      --model_repo {{ model.model_repo }} \
+                      <overrides>
+
               .. dropdown:: Benchmark options
                  :open:

@@ -280,42 +282,40 @@ system's configuration.
                       - Options
                       - Description

-                     * - ``$test_option``
-                       - latency
-                       - Measure decoding token latency
+                     * - ``--config``
+                       - ``configs/default.csv``
+                       - Run configs from the CSV for the chosen model repo and benchmark.

                     * -
-                       - throughput
-                       - Measure token generation throughput
+                       - ``configs/extended.csv``
+                       - 

                     * -
-                       - all
-                       - Measure both throughput and latency
+                       - ``configs/performance.csv``
+                       - 

-                     * - ``$num_gpu``
-                       - 1 or 8
-                       - Number of GPUs
+                     * - ``--benchmark``
+                       - ``throughput``
+                       - Measure offline end-to-end throughput.

-                     * - ``$datatype``
-                       - ``float16`` or ``float8``
-                       - Data type
+                     * - 
+                       - ``serving``
+                       - Measure online serving performance.
+
+                     * - 
+                       - ``all``
+                       - Measure both throughput and serving.
+
+                     * - `<overrides>`
+                       - See `run.sh <https://github.com/ROCm/MAD/blob/develop/scripts/vllm/run.sh>`__ for more info.
+                       - Additional overrides to the config CSV.

                  The input sequence length, output sequence length, and tensor parallel (TP) are
                  already configured. You don't need to specify them with this script.

-               Command:
-
-               .. code-block::
-
-                  ./vllm_benchmark_report.sh \
-                      -s $test_option \
-                      -m {{model.model_repo}} \
-                      -g $num_gpu \
-                      -d {{model.precision}}
-
               .. note::

-                  For best performance, it's recommend to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.
+                  For best performance, it's recommended to run with ``VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1``.

                  If you encounter the following error, pass your access-authorized Hugging
                  Face token to the gated models.
@@ -331,33 +331,33 @@ system's configuration.

            Here are some examples of running the benchmark with various options:

-            * Latency benchmark
-
-              Use this command to benchmark the latency of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
-
-              .. code-block::
-
-                 ./vllm_benchmark_report.sh \
-                     -s latency \
-                     -m {{model.model_repo}} \
-                     -g 8 \
-                     -d {{model.precision}}
-
-              Find the latency report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_latency_report.csv``.
-
            * Throughput benchmark

              Use this command to benchmark the throughput of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.

              .. code-block:: shell

-                 ./vllm_benchmark_report.sh \
-                     -s throughput \
-                     -m {{model.model_repo}} \
-                     -g 8 \
-                     -d {{model.precision}}
+                 export MAD_MODEL_NAME={{ model.mad_tag }}
+                 ./run.sh \
+                     --config configs/default.csv \
+                     --model_repo {{model.model_repo}} \
+                     --benchmark throughput

-              Find the throughput report at ``./reports_{{model.precision}}_vllm_rocm{{unified_docker.rocm_version}}/summary/{{model.model_repo.split('/', 1)[1] if '/' in model.model_repo else model.model_repo}}_throughput_report.csv``.
+              Find the throughput benchmark report at ``./{{ model.mad_tag }}_throughput.csv``.
+
+            * Serving benchmark
+
+              Use this command to benchmark the serving performance of the {{model.model}} model on eight GPUs with :literal:`{{model.precision}}` precision.
+
+              .. code-block::
+
+                 export MAD_MODEL_NAME={{ model.mad_tag }}
+                 ./run.sh \
+                     --config configs/default.csv \
+                     --model_repo {{model.model_repo}} \
+                     --benchmark serving
+
+              Find the serving benchmark report at ``./{{ model.mad_tag }}_serving.csv``.

            .. raw:: html

@@ -400,7 +400,7 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:
   .. code-block:: shell

      cd vllm
-      git checkout b432b7a285aa0dcb9677380936ffa74931bb6d6f
+      git checkout 340ea86dfe5955d6f9a9e767d6abab5aacf2c978

 3. Build the Docker image. Replace ``vllm-rocm`` with your desired image tag.

@@ -408,11 +408,6 @@ To reproduce this ROCm/vLLM Docker image release, follow these steps:

      docker build -f docker/Dockerfile.rocm -t vllm-rocm .

-Known issues and workarounds
-============================
-
-AITER does not support FP8 KV cache yet.
-
 Further reading
 ===============

--- a/docs/how-to/rocm-for-ai/install.rst
+++ b/docs/how-to/rocm-for-ai/install.rst
@@ -1,14 +1,14 @@
 .. meta::
-   :description: How to install ROCm and popular machine learning frameworks.
+   :description: How to install ROCm and popular deep learning frameworks.
   :keywords: ROCm, AI, LLM, train, fine-tune, FSDP, DeepSpeed, LLaMA, tutorial

 .. _rocm-for-ai-install:

-***********************************************
-Installing ROCm and machine learning frameworks
-***********************************************
+********************************************
+Installing ROCm and deep learning frameworks
+********************************************

-Before getting started, install ROCm and supported machine learning frameworks.
+Before getting started, install ROCm and supported deep learning frameworks.

 .. grid:: 1

@@ -22,9 +22,9 @@ If you’re new to ROCm, refer to the :doc:`ROCm quick start install guide for L
 <rocm-install-on-linux:install/quick-start>`.

 If you’re using a Radeon GPU for graphics-accelerated applications, refer to the
-`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/docs-6.1.3/docs/install/native_linux/install-radeon.html>`_.
+`Radeon installation instructions <https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/howto_native_linux.html>`_.

-You can install ROCm on :ref:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
+You can install ROCm on :doc:`compatible systems <rocm-install-on-linux:reference/system-requirements>` via your Linux
 distribution's package manager. See the following documentation resources to get started:

 * :doc:`ROCm installation overview <rocm-install-on-linux:install/install-overview>`
@@ -43,29 +43,16 @@ distribution's package manager. See the following documentation resources to get
      If you encounter any issues during installation, refer to the
      :doc:`Installation troubleshooting <rocm-install-on-linux:reference/install-faq>` guide.

-Machine learning frameworks
-===========================
+Deep learning frameworks
+========================

-ROCm supports popular machine learning frameworks and libraries including `PyTorch
+ROCm supports deep learning frameworks and libraries including `PyTorch
 <https://pytorch.org/blog/pytorch-for-amd-rocm-platform-now-available-as-python-package>`_, `TensorFlow
-<https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and `DeepSpeed
-<https://cloudblogs.microsoft.com/opensource/2022/03/21/supporting-efficient-large-model-training-on-amd-instinct-gpus-with-deepspeed/>`_.
+<https://tensorflow.org>`_, `JAX <https://jax.readthedocs.io/en/latest>`_, and more.

-Review the framework installation documentation. For ease-of-use, it's recommended to use official ROCm prebuilt Docker
+Review the :doc:`framework installation documentation <../deep-learning-rocm>`. For ease-of-use, it's recommended to use official ROCm prebuilt Docker
 images with the framework pre-installed.

-* :doc:`PyTorch for ROCm <rocm-install-on-linux:install/3rd-party/pytorch-install>`
-
-* :doc:`TensorFlow for ROCm <rocm-install-on-linux:install/3rd-party/tensorflow-install>`
-
-* :doc:`JAX for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
-
-* :doc:`verl for ROCm <rocm-install-on-linux:install/3rd-party/verl-install>`
-
-* :doc:`Stanford Megatron-LM for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
-
-* :doc:`DGL for ROCm <rocm-install-on-linux:install/3rd-party/jax-install>`
-
 Next steps
 ==========

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
@@ -1,3 +1,5 @@
+:orphan:
+
 .. meta::
   :description: How to train a model using Megatron-LM for ROCm.
   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
@@ -6,6 +8,14 @@
 Training a model with Megatron-LM for ROCm
 ******************************************

+.. caution::
+
+   The ROCm Megatron-LM framework now has limited support with this Docker
+   environment; it now focuses on Primus with Megatron-Core. See :doc:`primus-megatron`.
+
+   To learn how to migrate your existing workloads to Primus with Megatron-Core,
+   see :doc:`previous-versions/megatron-lm-primus-migration-guide`.
+
 The `Megatron-LM framework for ROCm <https://github.com/ROCm/Megatron-LM>`_ is
 a specialized fork of the robust Megatron-LM, designed to enable efficient
 training of large-scale language models on AMD GPUs. By leveraging AMD
@@ -20,13 +30,17 @@ essential components, including PyTorch, ROCm libraries, and Megatron-LM
 utilities. It contains the following software components to accelerate training
 workloads:

+.. note::
+
+   This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
+   Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.
+
 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/megatron-lm-benchmark-models.yaml

   {% set dockers = data.dockers %}
-   {% if dockers|length > 1 %}
   .. tab-set::

-      {% for docker in data.dockers %}
+      {% for docker in dockers %}
      .. tab-item:: ``{{ docker.pull_tag }}``
         :sync: {{ docker.pull_tag }}

@@ -42,28 +56,14 @@ workloads:

            {% endfor %}
      {% endfor %}
-   {% elif dockers|length == 1 %}
-   .. list-table::
-      :header-rows: 1
-
-      * - Software component
-        - Version
-
-      {% for component_name, component_version in docker.components %}
-      * - {{ component_name }}
-        - {{ component_version }}
-
-      {% endfor %}
-   {% endif %}

   .. _amd-megatron-lm-model-support:

-   The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
-
   Supported models
   ================

-   The following models are supported for training performance benchmarking with Megatron-LM and ROCm.
+   The following models are supported for training performance benchmarking with Megatron-LM and ROCm
+   on AMD Instinct MI300X series accelerators.
   Some instructions, commands, and training recommendations in this documentation might
   vary by model -- select one to get started.

@@ -177,7 +177,7 @@ Download the Docker image
      {% if dockers|length > 1 %}
      .. tab-set::

-         {% for docker in data.dockers %}
+         {% for docker in dockers %}
         .. tab-item:: {{ docker.doc_name }}
            :sync: {{ docker.pull_tag }}

@@ -227,10 +227,17 @@ Download the Docker image
      docker start megatron_training_env
      docker exec -it megatron_training_env bash

-The Docker container includes a pre-installed, verified version of the ROCm
-Megatron-LM development branch
-`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__, including necessary
-training scripts.
+4. **Megatron-LM backward compatibility setup** -- this Docker is primarily intended for use with Primus, but it maintains Megatron-LM compatibility with limited support.
+   To roll back to using Megatron-LM, follow these steps:
+
+   .. code-block:: shell
+
+      cd /workspace/Megatron-LM/
+      pip uninstall megatron-core
+      pip install -e .
+
+The Docker container hosts
+`<https://github.com/ROCm/Megatron-LM/tree/rocm_dev>`__ at verified commit ``e8e9edc``.

 .. _amd-megatron-lm-environment-setup:

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-history.rst
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/megatron-lm`` Docker image on `Docker Hub <https
     - Components
     - Resources

-   * - v25.6 (latest)
+   * - v25.7 (latest)
+     - 
+       * ROCm 
+       * PyTorch 
+     - 
+       * :doc:`Documentation <../megatron-lm>`
+       * `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.7_py310/images/sha256-6189df849feeeee3ae31bb1e97aef5006d69d2b90c134e97708c19632e20ab5a>`__
+
+   * - v25.6
     - 
       * ROCm 6.4.1
       * PyTorch 2.8.0a0+git7d205b2
     - 
-       * :doc:`Documentation <../megatron-lm>`
+       * :doc:`Documentation <megatron-lm-v25.6>`
       * `Docker Hub (py312) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py312/images/sha256-482ff906532285bceabdf2bda629bd32cb6174d2d07f4243a736378001b28df0>`__
       * `Docker Hub (py310) <https://hub.docker.com/layers/rocm/megatron-lm/v25.6_py310/images/sha256-9627bd9378684fe26cb1a10c7dd817868f553b33402e49b058355b0f095568d6>`__

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-primus-migration-guide.rst
@@ -0,0 +1,175 @@
+:orphan:
+
+**********************************************************************
+Migrating workloads to Primus (Megatron-Core backend) from Megatron-LM
+**********************************************************************
+
+Primus supports Megatron-Core as backend optimization library,
+replacing ROCm Megatron-LM. This document outlines the steps to migrate
+workload from ROCm Megatron-LM to Primus with the Megatron-Core backend.
+
+Model architecture
+==================
+
+ROCm Megatron-LM defines model architecture parameters in the training scripts;
+for example, the Llama 3 8B model parameters are defined in
+`examples/llama/train_llama3.sh <https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama3.sh#L117>`__
+as shown below:
+
+.. code-block:: bash
+
+   HIDDEN_SIZE=4096 
+   FFN_HIDDEN_SIZE=14336 
+   NUM_LAYERS=32 
+   NUM_HEADS=32 
+   NUM_KV_HEADS=8
+
+Primus defines the model architecture through model YAML configuration files
+inside the ``primus/configs/models/megatron/`` repository. For example, Llama 3 8B
+model architecture parameters are defined in
+`primus/configs/models/megatron/llama3_8B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/llama3_8B.yaml>`__
+as shown below:
+
+.. code-block:: yaml
+
+   bases:
+     - llama3_base.yaml
+
+   tokenizer_type: Llama3Tokenizer
+   tokenizer_model: meta-llama/Llama-3.1-8B
+
+   ffn_hidden_size: 14336
+   hidden_size: 4096
+   num_attention_heads: 32
+   num_layers: 32
+   num_query_groups: 8
+
+Primus' model config files follow a hierarchical design, meaning that new model
+config YAMLs can inherit existing model config files by importing them as
+bases. For example,
+`llama3.1_8B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
+uses ``llama3_8B.yaml`` as a base config and overrides few parameters, as shown below.
+In this example, ``llama3.1_8B`` overrides the ``max_position_embeddings`` value:
+
+.. code-block:: yaml
+
+   bases:
+     - llama3_8B.yaml
+
+   tokenizer_type: Llama3Tokenizer
+   tokenizer_model: meta-llama/Llama-3.1-8B
+
+   max_position_embeddings: 131072
+
+.. tip::
+
+   Primus provides ``llama_base.yaml`` as the base configuration, which can be
+   used as bases for additional model architectures. For example,
+   `mixtral_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/mixtral_base.yaml>`__
+   and
+   `deepseek_v3_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/deepseek_v3_base.yaml>`__
+   define ``llama_base.yaml`` as its base.
+
+   .. code-block:: yaml
+
+      # Example mixtral_base.yaml:
+
+      bases:
+        - llama_base.yaml
+
+      init_method_std: 0.01
+      rotary_base: 1000000
+      qk_layernorm: false
+
+      group_query_attention: true
+      num_query_groups: 8
+
+      # moe parameters
+      num_experts: 8
+      moe_router_topk: 2
+      moe_router_load_balancing_type: aux_loss
+      moe_aux_loss_coeff: 1e-2
+      moe_grouped_gemm: true
+      moe_token_dispatcher_type: alltoall
+
+It is recommended to add a new ``${MODEL_NAME}_base.yaml`` to add a new
+category of model and define new models on top of it. For example, to add
+Qwen2.5 models in Primus, we define
+`qwen2.5_base.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_base.yaml>`__
+and build
+`qwen2.5_7B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_7B.yaml>`__
+and
+`qwen2.5_72B.yaml <https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/primus/configs/models/megatron/qwen2.5_72B.yaml>`__
+using ``qwen2.5_base.yaml`` as the base config.
+
+Training parameters
+===================
+
+ROCm Megatron-LM also defines the training parameters, like batch size,
+tensor-parallelism, precision, as so on, in the training scripts. For example,
+Llama3 8B model parameters are defined in
+`examples/llama/train_llama3.sh <https://github.com/ROCm/Megatron-LM/blob/rocm_dev/examples/llama/train_llama3.sh>`__
+as shown below:
+
+.. code-block:: bash
+
+   TP="${TP:-8}"
+   PP="${PP:-1}"
+   CP="${CP:-1}"
+   MBS="${MBS:-1}"
+   BS="${BS:-8}"
+
+Primus defines the training parameters in top-level YAML files -- see
+`examples/megatron/configs/
+<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
+For example, the `llama3.1_8B-pretrain.yaml
+<https://github.com/AMD-AIG-AIMA/Primus/blob/v0.1.0-rc1/examples/megatron/configs/llama3.1_8B-pretrain.yaml>`__
+configuration imports the ``llama3.1_8B.yaml`` model architecture file. Users can then override
+the default training parameters in ``llama3.1_8B-pretrain.yaml``.
+
+.. code-block:: yaml
+
+   # model to run
+   model: llama3.1_8B.yaml  # Model architecture yaml
+   overrides:
+     # log
+     # disable_wandb: false
+     # disable_tensorboard: false
+     stderr_sink_level: DEBUG
+
+     log_avg_skip_iterations: 2
+     log_avg_reset_interval: 50
+
+     train_iters: 50
+     micro_batch_size: 2
+     global_batch_size: 128
+
+     seq_length: 8192
+     max_position_embeddings: 8192
+
+     lr: 1.0e-5
+     min_lr: 0.0
+     lr_warmup_iters: 2
+     lr_decay_iters: null
+     lr_decay_style: cosine
+     weight_decay: 0.1
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     eod_mask_loss: true
+     init_method_std: 0.008
+     norm_epsilon: 1.0e-6
+
+Backward compatibility with Megatron-LM
+=======================================
+
+The Dockerized environment used for Primus maintains compatibility with Megatron-LM with
+limited support. To roll back to using Megatron-LM, follow these steps.
+
+.. code-block:: shell
+
+   cd /workspace/Megatron-LM/
+   pip uninstall megatron-core
+   pip install -e .
+
+Once Megatron-LM is installed, follow :doc:`the documentation <../megatron-lm>` to run workloads as
+usual.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v24.12-dev.rst
@@ -18,7 +18,7 @@ Training a model with ROCm Megatron-LM
 The ROCm Megatron-LM framework is a specialized fork of the robust Megatron-LM, designed to
 enable efficient training of large-scale language models on AMD GPUs. By leveraging AMD Instinct™ MI300X
 accelerators, AMD Megatron-LM delivers enhanced scalability, performance, and resource utilization for AI
-workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support>`
+workloads. It is purpose-built to :ref:`support models <amd-megatron-lm-model-support-24-12>`
 like Meta's Llama 2, Llama 3, and Llama 3.1, enabling developers to train next-generation AI models with greater
 efficiency. See the GitHub repository at `<https://github.com/ROCm/Megatron-LM>`__.

@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-24-12:

 The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.3.rst
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-25-3:

 The following models are pre-optimized for performance on the AMD Instinct MI300X accelerator.

@@ -278,7 +278,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
   .. tab-item:: Llama
      :sync: llama

-      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``.
+      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``Llama2Tokenizer``.

      To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
      Set the Hugging Face model link in the ``TOKENIZER_MODEL`` variable.
@@ -292,7 +292,7 @@ handle a variety of input sequences, including unseen words or domain-specific t
   .. tab-item:: DeepSeek V2
      :sync: deepseek

-      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
+      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-3>`, use the ``DeepSeekV2Tokenizer``.

 Multi-node training
 ^^^^^^^^^^^^^^^^^^^
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.4.rst
@@ -67,7 +67,7 @@ Megatron-LM provides the following key features to train large language models e

 - Pre-training

-.. _amd-megatron-lm-model-support:
+.. _amd-megatron-lm-model-support-25-4:

 The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.

@@ -291,7 +291,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
   .. tab-item:: Llama
      :sync: llama

-      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``Llama2Tokenizer``
+      To train any of the Llama 2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``Llama2Tokenizer``
      or the default ``HuggingFaceTokenizer``.

      To train any of Llama 3 and Llama 3.1 models that this Docker image supports, use the ``HuggingFaceTokenizer``.
@@ -320,7 +320,7 @@ or ``${DATA_DIR}/tokenizer_llama2``.
   .. tab-item:: DeepSeek V2
      :sync: deepseek

-      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support>`, use the ``DeepSeekV2Tokenizer``.
+      To train any of the DeepSeek V2 models that :ref:`this Docker image supports <amd-megatron-lm-model-support-25-4>`, use the ``DeepSeekV2Tokenizer``.

 Multi-node training
 ^^^^^^^^^^^^^^^^^^^
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/megatron-lm-v25.6.rst
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-history.rst
@@ -16,12 +16,20 @@ previous releases of the ``ROCm/pytorch-training`` Docker image on `Docker Hub <
     - Components
     - Resources

+   * - v25.7
+     - 
+       * ROCm 6.4.2
+       * PyTorch 2.8.0a0+gitd06a406
+     - 
+       * :doc:`Documentation <../pytorch-training>`
+       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.7/images/sha256-cc6fd840ab89cb81d926fc29eca6d075aee9875a55a522675a4b9231c9a0a712>`__
+
   * - v25.6
     - 
       * ROCm 6.3.4
       * PyTorch 2.8.0a0+git7d205b2
     - 
-       * :doc:`Documentation <../pytorch-training>`
+       * :doc:`Documentation <pytorch-training-v25.6>`
       * `Docker Hub <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`__

   * - v25.5
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.5.rst
@@ -437,3 +437,8 @@ Once the setup is complete, choose between two options to start benchmarking:

           ./pytorch_benchmark_report.sh -t HF_finetune_lora -p BF16 -m Llama-2-70B

+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/previous-versions/pytorch-training-v25.6.rst
@@ -0,0 +1,456 @@
+:orphan:
+
+.. meta::
+   :description: How to train a model using PyTorch for ROCm.
+   :keywords: ROCm, AI, LLM, train, PyTorch, torch, Llama, flux, tutorial, docker
+
+**************************************
+Training a model with PyTorch for ROCm
+**************************************
+
+.. caution::
+
+   This documentation does not reflect the latest version of ROCm vLLM
+   performance benchmark documentation. See :doc:`../pytorch-training` for the latest version.
+
+PyTorch is an open-source machine learning framework that is widely used for
+model training with GPU-optimized components for transformer-based models.
+
+The `PyTorch for ROCm training Docker <https://hub.docker.com/layers/rocm/pytorch-training/v25.6/images/sha256-a4cea3c493a4a03d199a3e81960ac071d79a4a7a391aa9866add3b30a7842661>`_
+(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
+model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
+training workloads:
+
+--------------------------+--------------------------------+
+| Software component       | Version                        |
+==========================+================================+
+| ROCm                     | 6.3.4                          |
+--------------------------+--------------------------------+
+| PyTorch                  | 2.8.0a0+git7d205b2             |
+--------------------------+--------------------------------+
+| Python                   | 3.10.17                        |
+--------------------------+--------------------------------+
+| Transformer Engine       | 1.14.0+2f85f5f2                |
+--------------------------+--------------------------------+
+| Flash Attention          | 3.0.0.post1                    |
+--------------------------+--------------------------------+
+| hipBLASLt                | 0.15.0-8c6919d                 |
+--------------------------+--------------------------------+
+| Triton                   | 3.3.0                          |
+--------------------------+--------------------------------+
+
+.. _amd-pytorch-training-model-support-v256:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/previous-versions/pytorch-training-v25.6-benchmark-models.yaml
+
+   {% set unified_docker = data.unified_docker.latest %}
+   {% set model_groups = data.model_groups %}
+
+   .. raw:: html
+
+      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+        <div class="row">
+          <div class="col-2 me-2 model-param-head">Workload</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+   {% endfor %}
+          </div>
+        </div>
+
+        <div class="row mt-1">
+          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="row col-10">
+   {% for model_group in model_groups %}
+      {% set models = model_group.models %}
+      {% for model in models %}
+         {% if models|length % 3 == 0 %}
+            <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% else %}
+            <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+          </div>
+        </div>
+      </div>
+
+   .. note::
+
+      Some models require an external license agreement through a third party (for example, Meta).
+
+   .. _amd-pytorch-training-performance-measurements-v256:
+
+   Performance measurements
+   ========================
+
+   To evaluate performance, the
+   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+   page provides reference throughput and latency measurements for training
+   popular AI models.
+
+   .. note::
+
+      The performance data presented in
+      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+      should not be interpreted as the peak performance achievable by AMD
+      Instinct MI325X and MI300X accelerators or ROCm software.
+
+   System validation
+   =================
+
+   Before running AI workloads, it's important to validate that your AMD hardware is configured
+   correctly and performing optimally.
+
+   If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+   can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+   optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+   before starting training.
+
+   To test for optimal performance, consult the recommended :ref:`System health benchmarks
+   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+   system's configuration.
+
+   This Docker image is optimized for specific model configurations outlined
+   below. Performance can vary for other training workloads, as AMD
+   doesn’t validate configurations and run conditions outside those described.
+
+   Benchmarking
+   ============
+
+   Once the setup is complete, choose between two options to start benchmarking:
+
+   .. tab-set::
+
+      .. tab-item:: MAD-integrated benchmarking
+
+         Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+         directory and install the required packages on the host machine.
+
+         .. code-block:: shell
+
+            git clone https://github.com/ROCm/MAD
+            cd MAD
+            pip install -r requirements.txt
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            For example, use this command to run the performance benchmark test on the {{ model.model }} model
+            using one GPU with the {{ model.precision }} data type on the host machine.
+
+            .. code-block:: shell
+
+               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+               madengine run \
+                   --tags {{ model.mad_tag }} \
+                   --keep-model-dir \
+                   --live-output \
+                   --timeout 28800
+
+            MAD launches a Docker container with the name
+            ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
+            model are collected in the following path: ``~/MAD/perf.csv``.
+
+      {% endfor %}
+   {% endfor %}
+
+      .. tab-item:: Standalone benchmarking
+
+         .. rubric:: Download the Docker image and required packages
+
+         Use the following command to pull the Docker image from Docker Hub.
+
+         .. code-block:: shell
+
+            docker pull {{ unified_docker.pull_tag }}
+
+         Run the Docker container.
+
+         .. code-block:: shell
+
+            docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
+
+         Use these commands if you exit the ``training_env`` container and need to return to it.
+
+         .. code-block:: shell
+
+            docker start training_env
+            docker exec -it training_env bash
+
+         In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+         repository and navigate to the benchmark scripts directory
+         ``/workspace/MAD/scripts/pytorch_train``.
+
+         .. code-block:: shell
+
+            git clone https://github.com/ROCm/MAD
+            cd MAD/scripts/pytorch_train
+
+         .. rubric:: Prepare training datasets and dependencies
+
+         The following benchmarking examples require downloading models and datasets
+         from Hugging Face. To ensure successful access to gated repos, set your
+         ``HF_TOKEN``.
+
+         .. code-block:: shell
+
+            export HF_TOKEN=$your_personal_hugging_face_access_token
+
+         Run the setup script to install libraries and datasets needed for benchmarking.
+
+         .. code-block:: shell
+
+            ./pytorch_benchmark_setup.sh
+
+         .. container:: model-doc pyt_train_llama-3.1-8b
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``accelerate``
+                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+         .. container:: model-doc pyt_train_llama-3.1-70b
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+               * - ``torchdata``
+                 - `TorchData <https://pytorch.org/data/beta/index.html>`_
+
+               * - ``tomli``
+                 - `Tomli <https://pypi.org/project/tomli/>`_
+
+               * - ``tiktoken``
+                 - `tiktoken <https://github.com/openai/tiktoken>`_
+
+               * - ``blobfile``
+                 - `blobfile <https://pypi.org/project/blobfile/>`_
+
+               * - ``tabulate``
+                 - `tabulate <https://pypi.org/project/tabulate/>`_
+
+               * - ``wandb``
+                 - `Weights & Biases <https://github.com/wandb/wandb>`_
+
+               * - ``sentencepiece``
+                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+
+               * - ``tensorboard``
+                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+
+         .. container:: model-doc pyt_train_flux
+
+            ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Library
+                 - Reference
+
+               * - ``accelerate``
+                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+
+               * - ``datasets``
+                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+
+               * - ``sentencepiece``
+                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+
+               * - ``tensorboard``
+                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+
+               * - ``csvkit``
+                 - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
+
+               * - ``deepspeed``
+                 - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
+
+               * - ``diffusers``
+                 - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
+
+               * - ``GitPython``
+                 - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
+
+               * - ``opencv-python-headless``
+                 - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
+
+               * - ``peft``
+                 - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
+
+               * - ``protobuf``
+                 - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
+
+               * - ``pytest``
+                 - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
+
+               * - ``python-dotenv``
+                 - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
+
+               * - ``seaborn``
+                 - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
+
+               * - ``transformers``
+                 - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
+
+         ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
+
+         * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
+
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+         {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
+
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Pretraining
+
+            To start the pre-training benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+            {% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
+               * - ``$datatype``
+                 - ``BF16`` or ``FP8``
+                 - Only Llama 3.1 8B supports FP8 precision.
+            {% else %}
+               * - ``$datatype``
+                 - ``BF16``
+                 - Only Llama 3.1 8B supports FP8 precision.
+            {% endif %}
+
+               * - ``$sequence_length``
+                 - Sequence length for the language model.
+                 - Between 2048 and 8192. 8192 by default.
+
+            {% if model.mad_tag == "pyt_train_flux" %}
+            .. container:: model-doc {{ model.mad_tag }}
+
+               .. note::
+
+                  Occasionally, downloading the Flux dataset might fail. In the event of this
+                  error, manually download it from Hugging Face at
+                  `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
+                  and save it to `/workspace/FluxBenchmark`. This ensures that the test script can access
+                  the required dataset.
+            {% endif %}
+         {% endif %}
+
+         {% if model_group.tag == "fine-tuning" %}
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Fine-tuning
+
+            To start the fine-tuning benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               * - ``$training_mode``
+                 - ``finetune_fw``
+                 - Full weight fine-tuning (BF16 supported)
+
+               * -
+                 - ``finetune_lora``
+                 - LoRA fine-tuning (BF16 supported)
+
+               * -
+                 - ``finetune_qlora``
+                 - QLoRA fine-tuning (BF16 supported)
+
+               * -
+                 - ``HF_finetune_lora``
+                 - LoRA fine-tuning with Hugging Face PEFT
+
+               * - ``$datatype``
+                 - ``BF16``
+                 - All models support BF16.
+
+               * - ``$sequence_length``
+                 - Between 2048 and 16384.
+                 - Sequence length for the language model.
+
+            .. note::
+
+               {{ model.model }} currently supports the following fine-tuning methods:
+
+            {% for method in model.training_modes %}
+               * ``{{ method }}``
+            {% endfor %}
+            {% if model.training_modes|length < 4 %}
+
+               The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
+               does not currently provide YAML configuration files for other combinations of
+               model to fine-tuning method
+               However, you can still configure your own YAML files to enable support for
+               fine-tuning methods not listed here by following existing patterns in the
+               ``/workspace/torchtune/recipes/configs`` directory.
+            {% endif %}
+         {% endif %}
+      {% endfor %}
+   {% endfor %}
+
+               .. rubric:: Benchmarking examples
+
+               For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+
+Further reading
+===============
+
+- To learn more about MAD and the ``madengine`` CLI, see the `MAD usage guide <https://github.com/ROCm/MAD?tab=readme-ov-file#usage-guide>`__.
+
+- To learn more about system settings and management practices to configure your system for
+  AMD Instinct MI300X series accelerators, see `AMD Instinct MI300X system optimization <https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/system-optimization/mi300x.html>`_.
+
+- For a list of other ready-made Docker images for AI with ROCm, see
+  `AMD Infinity Hub <https://www.amd.com/en/developer/resources/infinity-hub.html#f-amd_hub_category=AI%20%26%20ML%20Models>`_.
+
+Previous versions
+=================
+
+See :doc:`pytorch-training-history` to find documentation for previous releases
+of the ``ROCm/pytorch-training`` Docker image.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
@@ -0,0 +1,602 @@
+.. meta::
+   :description: How to train a model using Megatron-LM for ROCm.
+   :keywords: ROCm, AI, LLM, train, Megatron-LM, megatron, Llama, tutorial, docker, torch
+
+**********************************************
+Training a model with Primus and Megatron-Core
+**********************************************
+
+`Primus <https://github.com/AMD-AIG-AIMA/Primus>`__ is a unified and flexible
+LLM training framework designed to streamline training. It streamlines LLM
+training on AMD Instinct accelerators using a modular, reproducible configuration paradigm.
+Primus is backend-agnostic and supports multiple training engines -- including Megatron-Core.
+
+.. note::
+
+   Primus with the Megatron-Core backend is intended to replace ROCm
+   Megatron-LM in this Dockerized training environment. To learn how to migrate
+   workloads from Megatron-LM to Primus with Megatron-Core, see
+   :doc:`previous-versions/megatron-lm-primus-migration-guide`.
+
+For ease of use, AMD provides a ready-to-use Docker image for MI300 series accelerators
+containing essential components for Primus and Megatron-Core.
+
+.. note::
+
+   This Docker environment is based on Python 3.10 and Ubuntu 22.04. For an alternative environment with
+   Python 3.12 and Ubuntu 24.04, see the :doc:`previous ROCm Megatron-LM v25.6 Docker release <previous-versions/megatron-lm-v25.6>`.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   {% set docker = dockers[0] %}
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}
+
+.. _amd-primus-megatron-lm-model-support:
+
+Supported models
+================
+
+The following models are pre-optimized for performance on AMD Instinct MI300X series accelerators.
+Some instructions, commands, and training examples in this documentation might
+vary by model -- select one to get started.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+
+   {% set model_groups = data.model_groups %}
+   .. raw:: html
+
+         <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
+           <div class="row">
+             <div class="col-2 me-2 model-param-head">Model</div>
+             <div class="row col-10">
+      {% for model_group in model_groups %}
+               <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+      {% endfor %}
+             </div>
+           </div>
+
+           <div class="row mt-1">
+             <div class="col-2 me-2 model-param-head">Model variant</div>
+             <div class="row col-10">
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+            {% if models|length % 3 == 0 %}
+               <div class="col-4 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% else %}
+               <div class="col-6 model-param" data-param-k="model" data-param-v="{{ model.mad_tag }}" data-param-group="{{ model_group.tag }}" tabindex="0">{{ model.model }}</div>
+            {% endif %}
+         {% endfor %}
+      {% endfor %}
+             </div>
+           </div>
+         </div>
+
+.. note::
+
+   Some models, such as Llama, require an external license agreement through
+   a third party (for example, Meta).
+
+System validation
+=================
+
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.
+
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.
+
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.
+
+.. _mi300x-amd-primus-megatron-lm-training:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+      {% set docker = dockers[0] %}
+
+   Environment setup
+   =================
+
+   Use the following instructions to set up the environment, configure the script to train models, and
+   reproduce the benchmark results on MI300X series accelerators with the ``{{ docker.pull_tag }}`` image.
+
+   .. _amd-primus-megatron-lm-requirements:
+
+   Download the Docker image
+   -------------------------
+
+   1. Use the following command to pull the Docker image from Docker Hub.
+
+      .. code-block:: shell
+
+         docker pull {{ docker.pull_tag }}
+
+   2. Launch the Docker container.
+
+      .. code-block:: shell
+
+         docker run -it \
+             --device /dev/dri \
+             --device /dev/kfd \
+             --device /dev/infiniband \
+             --network host --ipc host \
+             --group-add video \
+             --cap-add SYS_PTRACE \
+             --security-opt seccomp=unconfined \
+             --privileged \
+             -v $HOME:$HOME \
+             --shm-size 128G \
+             --name primus_training_env \
+             {{ docker.pull_tag }}
+
+3. Use these commands if you exit the ``primus_training_env`` container and need to return to it.
+
+   .. code-block:: shell
+
+      docker start primus_training_env
+      docker exec -it primus_training_env bash
+
+The Docker container hosts verified release tag ``v0.1.0-rc1`` of the `Primus
+<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1>`__ repository.
+
+.. _amd-primus-megatron-lm-environment-setup:
+
+Configuration
+=============
+
+Primus defines a training configuration in YAML for each model in
+`examples/megatron/configs <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/megatron/configs>`__.
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+
+   {% set model_groups = data.model_groups %}
+   {% for model_group in model_groups %}
+      {% for model in model_group.models %}
+   .. container:: model-doc {{ model.mad_tag }}
+
+      To update training parameters for {{ model.model }}, you can update ``examples/megatron/configs/{{ model.config_name }}``.
+      Note that training configuration YAML files for other models follow this naming convention.
+
+      {% endfor %}
+   {% endfor %}
+
+.. note::
+
+   See :ref:`Key options <amd-primus-megatron-lm-benchmark-test-vars>` for more information on configuration options.
+
+Dataset options
+---------------
+
+You can use either mock data or real data for training.
+
+* Mock data can be useful for testing and validation. Use the ``mock_data`` field to toggle between mock and real data. The default
+  value is ``true`` for enabled.
+
+  .. code-block:: yaml
+
+     mock_data: true
+
+* If you're using a real dataset, update the ``train_data_path`` field to point to the location of your dataset.
+
+  .. code-block:: bash
+
+     mock_data: false
+     train_data_path: /path/to/your/dataset
+
+  Ensure that the files are accessible inside the Docker container.
+
+.. _amd-primus-megatron-lm-tokenizer:
+
+Tokenizer
+---------
+
+In Primus, each model uses a tokenizer from Hugging Face. For example, Llama
+3.1 8B model uses ``tokenizer_model: meta-llama/Llama-3.1-8B`` and
+``tokenizer_type: Llama3Tokenizer`` defined in the `llama3.1-8B model
+<https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/primus/configs/models/megatron/llama3.1_8B.yaml>`__
+definition. As such, you need to set the ``HF_TOKEN`` environment variable with
+right permissions to access the tokenizer for each model.
+
+.. code-block:: bash
+
+   # Export your HF_TOKEN in the workspace
+   export HF_TOKEN=<your_hftoken>
+
+.. _amd-primus-megatron-lm-run-training:
+
+Run training
+============
+
+Use the following example commands to set up the environment, configure
+:ref:`key options <amd-primus-megatron-lm-benchmark-test-vars>`, and run training on
+MI300X series accelerators with the AMD Megatron-LM environment.
+
+Single node training
+--------------------
+
+To run training on a single node, navigate to ``/workspace/Primus`` and use the following setup command:
+
+.. code-block:: shell
+
+   pip install -r requirements.txt
+   export HSA_NO_SCRATCH_RECLAIM=1
+   export NVTE_CK_USES_BWD_V3=1
+
+Once setup is complete, run the appropriate training command.
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
+
+   To run pre-training for Llama 3.3 70B BF16, run:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh \
+          --micro_batch_size 2 \
+          --global_batch_size 16 \
+          --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
+
+   To run pre-training for Llama 3.1 8B FP8, run:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh \
+          --train_iters 50 \
+          --fp8 hybrid
+
+   For Llama 3.1 8B BF16, use the following command:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
+
+   To run pre-training for Llama 3.1 70B BF16, run:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh \
+           --train_iters 50
+
+   To run the training on a single node for Llama 3.1 70B FP8 with proxy, use the following command:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh \
+          --train_iters 50 \
+          --num_layers 40 \
+          --fp8 hybrid \
+          --no_fp8_weight_transpose_cache true
+
+   .. note::
+
+      Use two or more nodes to run the *full* Llama 70B model with FP8 precision.
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
+
+   To run pre-training for Llama 2 7B FP8, run:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh \
+          --train_iters 50 \
+          --fp8 hybrid
+
+   To run pre-training for Llama 2 7B BF16, run:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama2_7B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
+
+   To run pre-training for Llama 2 70B BF16, run:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+      bash ./examples/run_pretrain.sh --train_iters 50 
+
+.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v3-proxy
+
+   To run training on a single node for DeepSeek-V3 (MoE with expert parallel) with 3-layer proxy, 
+   use the following command:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/deepseek_v3-pretrain.yaml \
+      bash examples/run_pretrain.sh \
+          --num_layers 3 \
+          --moe_layer_freq 1 \
+          --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
+
+   To run training on a single node for DeepSeek-V2-Lite (MoE with expert parallel),
+   use the following command:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/deepseek_v2_lite-pretrain.yaml \
+      bash examples/run_pretrain.sh \
+          --global_batch_size 256 \
+          --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
+
+   To run training on a single node for Mixtral 8x7B (MoE with expert parallel),
+   use the following command:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
+      bash examples/run_pretrain.sh --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
+
+   To run training on a single node for Mixtral 8x7B (MoE with expert parallel) with 4-layer proxy,
+   use the following command:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/mixtral_8x22B_v0.1-pretrain.yaml \
+      bash examples/run_pretrain.sh \
+          --num_layers 4 \
+          --pipeline_model_parallel_size 1 \
+          --micro_batch_size 1 \
+          --global_batch_size 16 \
+          --train_iters 50
+
+.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-7b
+
+   To run training on a single node for Qwen 2.5 7B BF16, use the following
+   command:
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
+      bash examples/run_pretrain.sh --train_iters 50
+
+   For FP8, use the following command.
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/qwen2.5_7B-pretrain.yaml \
+      bash examples/run_pretrain.sh \
+          --train_iters 50 \
+          --fp8 hybrid
+
+.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
+
+   To run the training on a single node for Qwen 2.5 72B BF16, use the following command.
+
+   .. code-block:: shell
+
+      EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
+      bash examples/run_pretrain.sh --train_iters 50
+
+Multi-node training examples
+----------------------------
+
+To run training on multiple nodes, you can use the
+`run_slurm_pretrain.sh <https://github.com/AMD-AIG-AIMA/Primus/tree/v0.1.0-rc1/examples/run_slurm_pretrain.sh>`__
+to launch the multi-node workload. Use the following steps to setup your environment:
+
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/primus-megatron-benchmark-models.yaml
+
+   {% set dockers = data.dockers %}
+   {% set docker = dockers[0] %}
+
+   .. code-block:: shell
+
+      cd /workspace/Primus/
+      export DOCKER_IMAGE={{ docker.pull_tag }}
+      export HF_TOKEN=<your_HF_token>
+      export HSA_NO_SCRATCH_RECLAIM=1
+      export NVTE_CK_USES_BWD_V3=1
+      export NCCL_IB_HCA=<your_NCCL_IB_HCA> # specify which RDMA interfaces to use for communication
+      export NCCL_SOCKET_IFNAME=<your_NCCL_SOCKET_IFNAME> # your Network Interface
+      export GLOO_SOCKET_IFNAME=<your_GLOO_SOCKET_IFNAME> # your Network Interface
+      export NCCL_IB_GID_INDEX=3 # Set InfiniBand GID index for NCCL communication. Default is 3 for ROCE
+
+.. note::
+
+   * Make sure correct network drivers are installed on the nodes. If inside a Docker, either install the drivers inside the Docker container or pass the network drivers from the host while creating Docker container.
+   * If ``NCCL_IB_HCA`` and ``NCCL_SOCKET_IFNAME`` are not set, Primus will try to auto-detect. However, since NICs can vary accross different cluster, it is encouraged to explicitly export your NCCL parameters for the cluster.
+   * To find your network interface, you can use ``ip a``.
+   * To find RDMA interfaces, you can use ``ibv_devices`` to get the list of all the RDMA/IB  devices.
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.3-70b
+
+   To train Llama 3.3 70B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 4 \
+          --global_batch_size 256 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+   To train Llama 3.3 70B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/llama3.3_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 1 \
+          --global_batch_size 256 \
+          --recompute_num_layers 12
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-8b
+
+   To train Llama 3.1 8B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters. For e.g., `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case 
+      NNODES=8 EXP=examples/megatron/configs/llama3.1_8B-pretrain.yaml \
+      bash ./examples/run_slurm_pretrain.sh \
+          --global_batch_size 1024 \
+          --fp8 hybrid
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-3.1-70b
+
+   To train Llama 3.1 70B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 4 \
+          --global_batch_size 256 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+   To train Llama 3.1 70B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/llama3.1_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 1 \
+          --global_batch_size 256 \
+          --recompute_num_layers 12
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-7b
+
+   To train Llama 2 8B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      # Adjust the training parameters. For e.g., `global_batch_size: 8 * #single_node_bs` for 8 nodes in this case 
+      NNODES=8 EXP=examples/megatron/configs/llama2_7B-pretrain.yaml bash ./examples/run_slurm_pretrain.sh --global_batch_size 2048 --fp8 hybrid
+
+.. container:: model-doc primus_pyt_megatron_lm_train_llama-2-70b
+
+   To train Llama 2 70B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 10 \
+          --global_batch_size 640 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+   To train Llama 2 70B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/llama2_70B-pretrain.yaml \
+      bash ./examples/run_slurm_pretrain.sh \
+          --micro_batch_size 2 \
+          --global_batch_size 1536 \
+          --recompute_num_layers 12
+
+.. container:: model-doc primus_pyt_megatron_lm_train_mixtral-8x7b
+
+   To train Mixtral 8x7B BF16 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/mixtral_8x7B_v0.1-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 2 \
+          --global_batch_size 256
+
+.. container:: model-doc primus_pyt_megatron_lm_train_qwen2.5-72b
+
+   To train Qwen2.5 72B FP8 on 8 nodes, run:
+
+   .. code-block:: shell
+
+      NNODES=8 EXP=examples/megatron/configs/qwen2.5_72B-pretrain.yaml \
+      bash examples/run_slurm_pretrain.sh \
+          --micro_batch_size 8 \
+          --global_batch_size 512 \
+          --recompute_num_layers 80 \
+          --no_fp8_weight_transpose_cache true \
+          --fp8 hybrid
+
+.. _amd-primus-megatron-lm-benchmark-test-vars:
+
+Key options
+-----------
+
+The following are key options to take note of
+
+fp8
+  ``hybrid`` enables FP8 GEMMs.
+
+use_torch_fsdp2
+  ``use_torch_fsdp2: 1``  enables torch fsdp-v2. If FSDP is enabled,
+  set ``use_distributed_optimizer`` and ``overlap_param_gather`` to ``false``.
+
+profile
+  To enable PyTorch profiling, set these parameters:
+
+  .. code-block:: yaml
+
+     profile: true
+     use_pytorch_profiler: true
+     profile_step_end: 7
+     profile_step_start: 6
+
+train_iters
+  The total number of iterations (default: 50).
+
+mock_data
+  True by default.
+
+micro_batch_size
+  Micro batch size.
+
+global_batch_size
+  Global batch size.
+
+recompute_granularity
+  For activation checkpointing.
+
+num_layers
+  For using a reduced number of layers as with proxy models.
+
+Previous versions
+=================
+
+See :doc:`previous-versions/megatron-lm-history` to find documentation for previous releases
+of the ``ROCm/megatron-lm`` Docker image.
+
+This training environment now uses Primus with Megatron as the primary
+configuration. Limited support for the legacy ROCm Megatron-LM is still
+available. For instructions on using ROCm Megatron-LM, see the
+:doc:`megatron-lm` document.
--- a/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
+++ b/docs/how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
@@ -9,28 +9,25 @@ Training a model with PyTorch for ROCm
 PyTorch is an open-source machine learning framework that is widely used for
 model training with GPU-optimized components for transformer-based models.

-The `PyTorch for ROCm training Docker <https://hub.docker.com/r/rocm/pytorch-training/tags>`_
-(``rocm/pytorch-training:v25.6``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
-model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
-training workloads:
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

-+--------------------------+--------------------------------+
-| Software component       | Version                        |
-+==========================+================================+
-| ROCm                     | 6.3.4                          |
-+--------------------------+--------------------------------+
-| PyTorch                  | 2.8.0a0+git7d205b2             |
-+--------------------------+--------------------------------+
-| Python                   | 3.10.17                        |
-+--------------------------+--------------------------------+
-| Transformer Engine       | 1.14.0+2f85f5f2                |
-+--------------------------+--------------------------------+
-| Flash Attention          | 3.0.0.post1                    |
-+--------------------------+--------------------------------+
-| hipBLASLt                | 0.15.0-8c6919d                 |
-+--------------------------+--------------------------------+
-| Triton                   | 3.3.0                          |
-+--------------------------+--------------------------------+
+   {% set dockers = data.dockers %}
+   {% set docker = dockers[0] %}
+   The `PyTorch for ROCm training Docker <{{ docker.docker_hub_url }}>`__
+   (``{{ docker.pull_tag }}``) image provides a prebuilt optimized environment for fine-tuning and pretraining a
+   model on AMD Instinct MI325X and MI300X accelerators. It includes the following software components to accelerate
+   training workloads:
+
+   .. list-table::
+      :header-rows: 1
+
+      * - Software component
+        - Version
+
+      {% for component_name, component_version in docker.components.items() %}
+      * - {{ component_name }}
+        - {{ component_version }}
+      {% endfor %}

 .. _amd-pytorch-training-model-support:

@@ -38,26 +35,27 @@ Supported models
 ================

 The following models are pre-optimized for performance on the AMD Instinct MI325X and MI300X accelerators.
+Some instructions, commands, and training recommendations in this documentation might
+vary by model -- select one to get started.

 .. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

-   {% set unified_docker = data.unified_docker.latest %}
+   {% set unified_docker = data.dockers[0] %}
   {% set model_groups = data.model_groups %}
-
   .. raw:: html

      <div id="vllm-benchmark-ud-params-picker" class="container-fluid">
        <div class="row">
-          <div class="col-2 me-2 model-param-head">Workload</div>
+          <div class="col-2 me-2 model-param-head">Model group</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
-            <div class="col-6 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
+            <div class="col-3 model-param" data-param-k="model-group" data-param-v="{{ model_group.tag }}" tabindex="0">{{ model_group.group }}</div>
   {% endfor %}
          </div>
        </div>

        <div class="row mt-1">
-          <div class="col-2 me-2 model-param-head">Model</div>
+          <div class="col-2 me-2 model-param-head">Model variant</div>
          <div class="row col-10">
   {% for model_group in model_groups %}
      {% set models = model_group.models %}
@@ -73,84 +71,116 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
        </div>
      </div>

-   .. note::

-      Some models require an external license agreement through a third party (for example, Meta).
+   .. _amd-pytorch-training-supported-training-modes:

-   .. _amd-pytorch-training-performance-measurements:
+   The following table lists supported training modes per model.

-   Performance measurements
-   ========================
+   .. dropdown:: Supported training modes

-   To evaluate performance, the
+      .. list-table::
+         :header-rows: 1
+
+         * - Model
+           - Supported training modes
+
+      {% for model_group in model_groups %}
+         {% set models = model_group.models %}
+         {% for model in models %}
+         * - {{ model.model }}
+           - ``{{ model.training_modes | join('``, ``') }}``
+
+         {% endfor %}
+      {% endfor %}
+
+      .. note::
+
+         Some model and fine-tuning combinations are not listed. This is
+         because the `upstream torchtune repository <https://github.com/pytorch/torchtune>`__
+         doesn't provide default YAML configurations for them.
+         For advanced usage, you can create a custom configuration to enable
+         unlisted fine-tuning methods by using an existing file in the
+         ``/workspace/torchtune/recipes/configs`` directory as a template.
+
+.. _amd-pytorch-training-performance-measurements:
+
+Performance measurements
+========================
+
+To evaluate performance, the
+`Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
+page provides reference throughput and latency measurements for training
+popular AI models.
+
+.. note::
+
+   The performance data presented in
   `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
-   page provides reference throughput and latency measurements for training
-   popular AI models.
+   should not be interpreted as the peak performance achievable by AMD
+   Instinct MI325X and MI300X accelerators or ROCm software.

-   .. note::
+System validation
+=================

-      The performance data presented in
-      `Performance results with AMD ROCm software <https://www.amd.com/en/developer/resources/rocm-hub/dev-ai/performance-results.html#tabs-a8deaeb413-item-21cea50186-tab>`_
-      should not be interpreted as the peak performance achievable by AMD
-      Instinct MI325X and MI300X accelerators or ROCm software.
+Before running AI workloads, it's important to validate that your AMD hardware is configured
+correctly and performing optimally.

-   System validation
-   =================
+If you have already validated your system settings, including aspects like NUMA auto-balancing, you
+can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
+optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
+before starting training.

-   Before running AI workloads, it's important to validate that your AMD hardware is configured
-   correctly and performing optimally.
+To test for optimal performance, consult the recommended :ref:`System health benchmarks
+<rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
+system's configuration.

-   If you have already validated your system settings, including aspects like NUMA auto-balancing, you
-   can skip this step. Otherwise, complete the procedures in the :ref:`System validation and
-   optimization <rocm-for-ai-system-optimization>` guide to properly configure your system settings
-   before starting training.
+This Docker image is optimized for specific model configurations outlined
+below. Performance can vary for other training workloads, as AMD
+doesn’t test configurations and run conditions outside those described.

-   To test for optimal performance, consult the recommended :ref:`System health benchmarks
-   <rocm-for-ai-system-health-bench>`. This suite of tests will help you verify and fine-tune your
-   system's configuration.
+Run training
+============

-   This Docker image is optimized for specific model configurations outlined
-   below. Performance can vary for other training workloads, as AMD
-   doesn’t validate configurations and run conditions outside those described.
+.. datatemplate:yaml:: /data/how-to/rocm-for-ai/training/pytorch-training-benchmark-models.yaml

-   Benchmarking
-   ============
+   {% set unified_docker = data.dockers[0] %}
+   {% set model_groups = data.model_groups %}

-   Once the setup is complete, choose between two options to start benchmarking:
+   Once the setup is complete, choose between two options to start benchmarking training:

   .. tab-set::

      .. tab-item:: MAD-integrated benchmarking

-         Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
-         directory and install the required packages on the host machine.
+         1. Clone the ROCm Model Automation and Dashboarding (`<https://github.com/ROCm/MAD>`__) repository to a local
+            directory and install the required packages on the host machine.

-         .. code-block:: shell
+            .. code-block:: shell

-            git clone https://github.com/ROCm/MAD
-            cd MAD
-            pip install -r requirements.txt
+               git clone https://github.com/ROCm/MAD
+               cd MAD
+               pip install -r requirements.txt

   {% for model_group in model_groups %}
      {% for model in model_group.models %}

         .. container:: model-doc {{ model.mad_tag }}

-            For example, use this command to run the performance benchmark test on the {{ model.model }} model
-            using one GPU with the {{ model.precision }} data type on the host machine.
+            2. For example, use this command to run the performance benchmark test on the {{ model.model }} model
+               using one node with the {{ model.precision }} data type on the host machine.

-            .. code-block:: shell
+               .. code-block:: shell

-               export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
-               madengine run \
-                   --tags {{ model.mad_tag }} \
-                   --keep-model-dir \
-                   --live-output \
-                   --timeout 28800
+                  export MAD_SECRETS_HFTOKEN="your personal Hugging Face token to access gated models"
+                  madengine run \
+                      --tags {{ model.mad_tag }} \
+                      --keep-model-dir \
+                      --live-output \
+                      --timeout 28800

-            MAD launches a Docker container with the name
-            ``container_ci-{{ model.mad_tag }}``, for example. The latency and throughput reports of the
-            model are collected in the following path: ``~/MAD/perf.csv``.
+               MAD launches a Docker container with the name
+               ``container_ci-{{ model.mad_tag }}``. The latency and throughput reports of the
+               model are collected in ``~/MAD/perf.csv``.

      {% endfor %}
   {% endfor %}
@@ -159,222 +189,213 @@ The following models are pre-optimized for performance on the AMD Instinct MI325

         .. rubric:: Download the Docker image and required packages

-         Use the following command to pull the Docker image from Docker Hub.
+         1. Use the following command to pull the Docker image from Docker Hub.

-         .. code-block:: shell
+            .. code-block:: shell

-            docker pull {{ unified_docker.pull_tag }}
+               docker pull {{ unified_docker.pull_tag }}

-         Run the Docker container.
+         2. Run the Docker container.

-         .. code-block:: shell
+            .. code-block:: shell

-            docker run -it --device /dev/dri --device /dev/kfd --network host --ipc host --group-add video --cap-add SYS_PTRACE --security-opt seccomp=unconfined --privileged -v $HOME:$HOME -v  $HOME/.ssh:/root/.ssh --shm-size 64G --name training_env {{ unified_docker.pull_tag }}
+               docker run -it \
+                   --device /dev/dri \
+                   --device /dev/kfd \
+                   --network host \
+                   --ipc host \
+                   --group-add video \
+                   --cap-add SYS_PTRACE \
+                   --security-opt seccomp=unconfined \
+                   --privileged \
+                   -v $HOME:$HOME \
+                   -v $HOME/.ssh:/root/.ssh \
+                   --shm-size 64G \
+                   --name training_env \
+                   {{ unified_docker.pull_tag }}

-         Use these commands if you exit the ``training_env`` container and need to return to it.
+            Use these commands if you exit the ``training_env`` container and need to return to it.

-         .. code-block:: shell
+            .. code-block:: shell

-            docker start training_env
-            docker exec -it training_env bash
+               docker start training_env
+               docker exec -it training_env bash

-         In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
-         repository and navigate to the benchmark scripts directory
-         ``/workspace/MAD/scripts/pytorch_train``.
+         3. In the Docker container, clone the `<https://github.com/ROCm/MAD>`__
+            repository and navigate to the benchmark scripts directory
+            ``/workspace/MAD/scripts/pytorch_train``.

-         .. code-block:: shell
+            .. code-block:: shell

-            git clone https://github.com/ROCm/MAD
-            cd MAD/scripts/pytorch_train
+               git clone https://github.com/ROCm/MAD
+               cd MAD/scripts/pytorch_train

         .. rubric:: Prepare training datasets and dependencies

-         The following benchmarking examples require downloading models and datasets
-         from Hugging Face. To ensure successful access to gated repos, set your
-         ``HF_TOKEN``.
+         1. The following benchmarking examples require downloading models and datasets
+            from Hugging Face. To ensure successful access to gated repos, set your
+            ``HF_TOKEN``.

-         .. code-block:: shell
+            .. code-block:: shell

-            export HF_TOKEN=$your_personal_hugging_face_access_token
+               export HF_TOKEN=$your_personal_hugging_face_access_token

-         Run the setup script to install libraries and datasets needed for benchmarking.
+         2. Run the setup script to install libraries and datasets needed for benchmarking.

-         .. code-block:: shell
+            .. code-block:: shell

-            ./pytorch_benchmark_setup.sh
+               ./pytorch_benchmark_setup.sh

-         .. container:: model-doc pyt_train_llama-3.1-8b
+            .. container:: model-doc pyt_train_llama-3.1-8b

-            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 8B:

-            .. list-table::
-               :header-rows: 1
+               .. list-table::
+                  :header-rows: 1

-               * - Library
-                 - Reference
+                  * - Library
+                    - Reference

-               * - ``accelerate``
-                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+                  * - ``accelerate``
+                    - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

-               * - ``datasets``
-                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

-         .. container:: model-doc pyt_train_llama-3.1-70b
+            .. container:: model-doc pyt_train_llama-3.1-70b

-            ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for Llama 3.1 70B:

-            .. list-table::
-               :header-rows: 1
+               .. list-table::
+                  :header-rows: 1

-               * - Library
-                 - Reference
+                  * - Library
+                    - Reference

-               * - ``datasets``
-                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

-               * - ``torchdata``
-                 - `TorchData <https://pytorch.org/data/beta/index.html>`_
+                  * - ``torchdata``
+                    - `TorchData <https://pytorch.org/data/beta/index.html>`_

-               * - ``tomli``
-                 - `Tomli <https://pypi.org/project/tomli/>`_
+                  * - ``tomli``
+                    - `Tomli <https://pypi.org/project/tomli/>`_

-               * - ``tiktoken``
-                 - `tiktoken <https://github.com/openai/tiktoken>`_
+                  * - ``tiktoken``
+                    - `tiktoken <https://github.com/openai/tiktoken>`_

-               * - ``blobfile``
-                 - `blobfile <https://pypi.org/project/blobfile/>`_
+                  * - ``blobfile``
+                    - `blobfile <https://pypi.org/project/blobfile/>`_

-               * - ``tabulate``
-                 - `tabulate <https://pypi.org/project/tabulate/>`_
+                  * - ``tabulate``
+                    - `tabulate <https://pypi.org/project/tabulate/>`_

-               * - ``wandb``
-                 - `Weights & Biases <https://github.com/wandb/wandb>`_
+                  * - ``wandb``
+                    - `Weights & Biases <https://github.com/wandb/wandb>`_

-               * - ``sentencepiece``
-                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+                  * - ``sentencepiece``
+                    - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

-               * - ``tensorboard``
-                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+                  * - ``tensorboard``
+                    - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

-         .. container:: model-doc pyt_train_flux
+            .. container:: model-doc pyt_train_flux

-            ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:
+               ``pytorch_benchmark_setup.sh`` installs the following libraries for FLUX:

-            .. list-table::
-               :header-rows: 1
+               .. list-table::
+                  :header-rows: 1

-               * - Library
-                 - Reference
+                  * - Library
+                    - Reference

-               * - ``accelerate``
-                 - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_
+                  * - ``accelerate``
+                    - `Hugging Face Accelerate <https://huggingface.co/docs/accelerate/en/index>`_

-               * - ``datasets``
-                 - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0
+                  * - ``datasets``
+                    - `Hugging Face Datasets <https://huggingface.co/docs/datasets/v3.2.0/en/index>`_ 3.2.0

-               * - ``sentencepiece``
-                 - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0
+                  * - ``sentencepiece``
+                    - `SentencePiece <https://github.com/google/sentencepiece>`_ 0.2.0

-               * - ``tensorboard``
-                 - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0
+                  * - ``tensorboard``
+                    - `TensorBoard <https://www.tensorflow.org/tensorboard>`_ 2.18.0

-               * - ``csvkit``
-                 - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1
+                  * - ``csvkit``
+                    - `csvkit <https://csvkit.readthedocs.io/en/latest/>`_ 2.0.1

-               * - ``deepspeed``
-                 - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2
+                  * - ``deepspeed``
+                    - `DeepSpeed <https://github.com/deepspeedai/DeepSpeed>`_ 0.16.2

-               * - ``diffusers``
-                 - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0
+                  * - ``diffusers``
+                    - `Hugging Face Diffusers <https://huggingface.co/docs/diffusers/en/index>`_ 0.31.0

-               * - ``GitPython``
-                 - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44
+                  * - ``GitPython``
+                    - `GitPython <https://github.com/gitpython-developers/GitPython>`_ 3.1.44

-               * - ``opencv-python-headless``
-                 - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84
+                  * - ``opencv-python-headless``
+                    - `opencv-python-headless <https://pypi.org/project/opencv-python-headless/>`_ 4.10.0.84

-               * - ``peft``
-                 - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0
+                  * - ``peft``
+                    - `PEFT <https://huggingface.co/docs/peft/en/index>`_ 0.14.0

-               * - ``protobuf``
-                 - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2
+                  * - ``protobuf``
+                    - `Protocol Buffers <https://github.com/protocolbuffers/protobuf>`_ 5.29.2

-               * - ``pytest``
-                 - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4
+                  * - ``pytest``
+                    - `PyTest <https://docs.pytest.org/en/stable/>`_ 8.3.4

-               * - ``python-dotenv``
-                 - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1
+                  * - ``python-dotenv``
+                    - `python-dotenv <https://pypi.org/project/python-dotenv/>`_ 1.0.1

-               * - ``seaborn``
-                 - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2
+                  * - ``seaborn``
+                    - `Seaborn <https://seaborn.pydata.org/>`_ 0.13.2

-               * - ``transformers``
-                 - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0
+                  * - ``transformers``
+                    - `Transformers <https://huggingface.co/docs/transformers/en/index>`_ 4.47.0

-         ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:
+            ``pytorch_benchmark_setup.sh`` downloads the following datasets from Hugging Face:

-         * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_
+            * `bghira/pseudo-camera-10k <https://huggingface.co/datasets/bghira/pseudo-camera-10k>`_

   {% for model_group in model_groups %}
      {% for model in model_group.models %}
-         {% if model_group.tag == "pre-training" and model.mad_tag in ["pyt_train_llama-3.1-8b", "pyt_train_llama-3.1-70b", "pyt_train_flux"] %}
+         {% set training_modes = model.training_modes %}
+         {% set training_mode_descs = {
+            "pretrain": "Benchmark pre-training.",
+            "HF_pretrain": "Llama 3.1 8B pre-training with FP8 precision."
+         } %}
+         {% set available_modes = training_modes | select("in", ["pretrain", "HF_pretrain"]) | list %}
+         {% if available_modes %}

         .. container:: model-doc {{ model.mad_tag }}

-            .. rubric:: Pretraining
+            .. rubric:: Pre-training

            To start the pre-training benchmark, use the following command with the
            appropriate options. See the following list of options and their descriptions.

            .. code-block:: shell

-               ./pytorch_benchmark_report.sh -t pretrain -m {{ model.model_repo }} -p $datatype -s $sequence_length
-
-            .. list-table::
-               :header-rows: 1
-
-               * - Name
-                 - Options
-                 - Description
-
-            {% if model.mad_tag == "pyt_train_llama-3.1-8b" %}
-               * - ``$datatype``
-                 - ``BF16`` or ``FP8``
-                 - Only Llama 3.1 8B supports FP8 precision.
-            {% else %}
-               * - ``$datatype``
-                 - ``BF16``
-                 - Only Llama 3.1 8B supports FP8 precision.
-            {% endif %}
-
-               * - ``$sequence_length``
-                 - Sequence length for the language model.
-                 - Between 2048 and 8192. 8192 by default.
+               ./pytorch_benchmark_report.sh -t {% if available_modes | length == 1 %}{{ available_modes[0] }}{% else %}$training_mode{% endif %} \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length

            {% if model.mad_tag == "pyt_train_flux" %}
            .. container:: model-doc {{ model.mad_tag }}

               .. note::

+                  Currently, FLUX models are not supported out-of-the-box on {{ unified_docker.pull_tag }}.
+                  To use FLUX, refer to the previous version of the ``pytorch-training`` Docker: :doc:`previous-versions/pytorch-training-v25.6`
+
                  Occasionally, downloading the Flux dataset might fail. In the event of this
                  error, manually download it from Hugging Face at
                  `black-forest-labs/FLUX.1-dev <https://huggingface.co/black-forest-labs/FLUX.1-dev>`_
                  and save it to `/workspace/FluxBenchmark`. This ensures that the test script can access
                  the required dataset.
            {% endif %}
-         {% endif %}
-
-         {% if model_group.tag == "fine-tuning" %}
-         .. container:: model-doc {{ model.mad_tag }}
-
-            .. rubric:: Fine-tuning
-
-            To start the fine-tuning benchmark, use the following command with the
-            appropriate options. See the following list of options and their descriptions.
-
-            .. code-block:: shell
-
-               ./pytorch_benchmark_report.sh -t $training_mode -m {{ model.model_repo }} -p BF16 -s $sequence_length

            .. list-table::
               :header-rows: 1
@@ -383,53 +404,143 @@ The following models are pre-optimized for performance on the AMD Instinct MI325
                 - Options
                 - Description

-               * - ``$training_mode``
-                 - ``finetune_fw``
-                 - Full weight fine-tuning (BF16 supported)
-
-               * -
-                 - ``finetune_lora``
-                 - LoRA fine-tuning (BF16 supported)
-
-               * -
-                 - ``finetune_qlora``
-                 - QLoRA fine-tuning (BF16 supported)
-
-               * -
-                 - ``HF_finetune_lora``
-                 - LoRA fine-tuning with Hugging Face PEFT
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}

               * - ``$datatype``
-                 - ``BF16``
-                 - All models support BF16.
+                 - ``BF16``{% if model.mad_tag == "pyt_train_llama-3.1-8b" %} or ``FP8``{% endif %}
+                 - Only Llama 3.1 8B supports FP8 precision.
+
+               * - ``$sequence_length``
+                 - Sequence length for the language model.
+                 - Between 2048 and 8192. 8192 by default.
+         {% endif %}
+
+         {% set training_mode_descs = {
+            "finetune_fw": "Full weight fine-tuning (BF16 and FP8 supported).",
+            "finetune_lora": "LoRA fine-tuning (BF16 supported).",
+            "finetune_qlora": "QLoRA fine-tuning (BF16 supported).",
+            "HF_finetune_lora": "LoRA fine-tuning with Hugging Face PEFT.",
+         } %}
+         {% set available_modes = training_modes | select("in", ["finetune_fw", "finetune_lora", "finetune_qlora", "HF_finetune_lora"]) | list %}
+         {% if available_modes %}
+         .. container:: model-doc {{ model.mad_tag }}
+
+            .. rubric:: Fine-tuning
+
+            To start the fine-tuning benchmark, use the following command with the
+            appropriate options. See the following list of options and their descriptions.
+            See :ref:`supported training modes <amd-pytorch-training-supported-training-modes>`.
+
+            .. code-block:: shell
+
+               ./pytorch_benchmark_report.sh -t $training_mode \
+                   -m {{ model.model_repo }} \
+                   -p $datatype \
+                   -s $sequence_length
+
+            .. list-table::
+               :header-rows: 1
+
+               * - Name
+                 - Options
+                 - Description
+
+               {% for mode in available_modes %}
+               * - {% if loop.first %}``$training_mode``{% endif %}
+                 - ``{{ mode }}``
+                 - {{ training_mode_descs[mode] }}
+               {% endfor %}
+
+               * - ``$datatype``
+                 - ``BF16``{% if "finetune_fw" in available_modes %} or ``FP8``{% endif %}
+                 - All models support BF16.{% if "finetune_fw" in available_modes %} FP8 is only available for full weight fine-tuning.{% endif %}

               * - ``$sequence_length``
                 - Between 2048 and 16384.
                 - Sequence length for the language model.

+            {% if model.mad_tag in ["pyt_train_llama3.2-vision-11b", "pyt_train_llama-3.2-vision-90b"] %}
            .. note::

-               {{ model.model }} currently supports the following fine-tuning methods:
+               For LoRA and QLoRA support with vision models (Llama 3.2 11B and 90B),
+               use the following torchtune commit for compatibility:

-            {% for method in model.training_modes %}
-               * ``{{ method }}``
-            {% endfor %}
-            {% if model.training_modes|length < 4 %}
+               .. code-block:: shell
+
+                  git checkout 48192e23188b1fc524dd6d127725ceb2348e7f0e
+
+            {% elif model.mad_tag in ["pyt_train_llama-2-7b", "pyt_train_llama-2-13b", "pyt_train_llama-2-70b"] %}
+            .. note::
+
+               You might encounter the following error with Llama 2: ``ValueError: seq_len (16384) of
+               input tensor should be smaller than max_seq_len (4096)``.
+               This error indicates that an input sequence is longer than the model's maximum context window.
+
+               Ensure your tokenized input does not exceed the model's ``max_seq_len`` (4096
+               tokens in this case). You can resolve this by truncating the input or splitting
+               it into smaller chunks before passing it to the model.
+
+               Note on reproducibility: The results in this guide are based on
+               commit ``b4c98ac`` from the upstream
+               `<https://github.com/pytorch/torchtune>`__ repository. For the
+               latest updates, you can use the main branch.

-               The upstream `torchtune <https://github.com/pytorch/torchtune>`_ repository
-               does not currently provide YAML configuration files for other combinations of
-               model to fine-tuning method
-               However, you can still configure your own YAML files to enable support for
-               fine-tuning methods not listed here by following existing patterns in the
-               ``/workspace/torchtune/recipes/configs`` directory.
            {% endif %}
         {% endif %}
      {% endfor %}
   {% endfor %}

-               .. rubric:: Benchmarking examples
+            .. rubric:: Benchmarking examples

-               For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+            For examples of benchmarking commands, see `<https://github.com/ROCm/MAD/tree/develop/benchmark/pytorch_train#benchmarking-examples>`__.
+
+Multi-node training
+-------------------
+
+Pre-training
+~~~~~~~~~~~~
+
+Multi-node training with torchtitan is supported. The provided SLURM script is pre-configured for Llama 3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch run_slurm_train.sh
+
+Fine-tuning
+~~~~~~~~~~~
+
+Multi-node training with torchtune is supported. The provided SLURM script is pre-configured for Llama 3.3 70B.
+
+To launch the training job on a SLURM cluster for Llama 3.3 70B, run the following commands from the MAD repository.
+
+.. code-block:: shell
+
+   huggingface-cli login # Get access to HF Llama model space
+   huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ./models/Llama-3.3-70B-Instruct # Download the Llama 3.3 model locally
+   # In the MAD repository
+   cd scripts/pytorch_train
+   sbatch Torchtune_Multinode.sh
+
+.. note::
+
+   Information regarding benchmark setup:
+
+   * By default, Llama 3.3 70B is fine-tuned using ``alpaca_dataset``.
+   * You can adjust the torchtune `YAML configuration file
+     <https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3_3/70B_full_multinode.yaml>`__
+     if you're using a different model.
+   * The number of nodes and other parameters can be tuned in the SLURM script ``Torchtune_Multinode.sh``.
+   * Set the ``mounting_paths`` inside the SLURM script.
+
+Once the run is finished, you can find the log files in the ``result_torchtune/`` directory.

 Further reading
 ===============
--- a/docs/how-to/rocm-for-ai/training/index.rst
+++ b/docs/how-to/rocm-for-ai/training/index.rst
@@ -21,6 +21,8 @@ In this guide, you'll learn about:

 - Training a model

+  - :doc:`With Primus (Megatron-LM backend) <benchmark-docker/primus-megatron>`
+
  - :doc:`With Megatron-LM <benchmark-docker/megatron-lm>`

  - :doc:`With PyTorch <benchmark-docker/pytorch-training>`
--- a/docs/reference/gpu-arch-specs.rst
+++ b/docs/reference/gpu-arch-specs.rst
@@ -285,7 +285,7 @@ For more information about ROCm hardware compatibility, see the ROCm `Compatibil
          - Radeon AI PRO R9700
          - RDNA4
          - gfx1201
-          - 16
+          - 32
          - 64
          - 32 or 64
          - 128
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -27,6 +27,24 @@ subtrees:
    title: ROCm on Radeon GPUs
  - file: how-to/deep-learning-rocm.md
    title: Deep learning frameworks
+    subtrees:
+    - entries:
+      - file: compatibility/ml-compatibility/pytorch-compatibility.rst
+        title: PyTorch compatibility
+      - file: compatibility/ml-compatibility/tensorflow-compatibility.rst
+        title: TensorFlow compatibility  
+      - file: compatibility/ml-compatibility/jax-compatibility.rst
+        title: JAX compatibility
+      - file: compatibility/ml-compatibility/verl-compatibility.rst
+        title: verl compatibility  
+      - file: compatibility/ml-compatibility/stanford-megatron-lm-compatibility.rst
+        title: Stanford Megatron-LM compatibility
+      - file: compatibility/ml-compatibility/dgl-compatibility.rst
+        title: DGL compatibility  
+      - file: compatibility/ml-compatibility/megablocks-compatibility.rst
+        title: Megablocks compatibility
+      - file: compatibility/ml-compatibility/taichi-compatibility.rst
+        title: Taichi compatibility 
  - file: how-to/build-rocm.rst
    title: Build ROCm from source

@@ -44,8 +62,8 @@ subtrees:
        title: Training
        subtrees:
        - entries:
-          - file: how-to/rocm-for-ai/training/benchmark-docker/megatron-lm.rst
-            title: Train a model with Megatron-LM
+          - file: how-to/rocm-for-ai/training/benchmark-docker/primus-megatron.rst
+            title: Train a model with Primus and Megatron-Core
          - file: how-to/rocm-for-ai/training/benchmark-docker/pytorch-training.rst
            title: Train a model with PyTorch
          - file: how-to/rocm-for-ai/training/benchmark-docker/jax-maxtext.rst
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -234,7 +234,7 @@ sphinx-notfound-page==1.1.0
    # via rocm-docs-core
 sphinx-reredirects==0.1.6
    # via -r requirements.in
-sphinx-sitemap==2.7.2
+sphinx-sitemap==2.8.0
    # via -r requirements.in
 sphinxcontrib-applehelp==2.0.0
    # via sphinx
--- a/tools/autotag/components.xml
+++ b/tools/autotag/components.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <manifest>
    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
-    <default revision="refs/tags/rocm-6.4.2"
+    <default revision="refs/tags/rocm-6.4.3"
     remote="rocm-org"
     sync-c="true"
     sync-j="4" />
--- a/tools/rocm-build/rocm-6.4.3.xml
+++ b/tools/rocm-build/rocm-6.4.3.xml
@@ -0,0 +1,79 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<manifest>
+    <remote name="rocm-org" fetch="https://github.com/ROCm/" />
+    <default revision="refs/tags/rocm-6.4.3"
+     remote="rocm-org"
+     sync-c="true"
+     sync-j="4" />
+<!--list of projects for ROCm-->
+    <project name="ROCm" revision="roc-6.4.x" />
+    <project name="ROCK-Kernel-Driver" />
+    <project name="ROCR-Runtime" />
+    <project name="amdsmi" />
+    <project name="rdc" />
+    <project name="rocm_bandwidth_test" />
+    <project name="rocm_smi_lib" />
+    <project name="rocm-core" />
+    <project name="rocm-examples" />
+    <project name="rocminfo" />
+    <project name="rocprofiler" />
+    <project name="rocprofiler-register" />
+    <project name="rocprofiler-sdk" />
+    <project name="rocprofiler-compute" />
+    <project name="rocprofiler-systems" />
+    <project name="roctracer" />
+<!--HIP Projects-->
+    <project name="hip" />
+    <project name="hip-tests" />
+    <project name="HIPIFY" />
+    <project name="clr" />
+    <project name="hipother" />
+<!-- The following projects are all associated with the AMDGPU LLVM compiler -->
+    <project name="half" />
+    <project name="llvm-project" />
+    <project name="spirv-llvm-translator" />
+<!-- gdb projects -->
+    <project name="ROCdbgapi" />
+    <project name="ROCgdb" />
+    <project name="rocr_debug_agent" />
+<!-- ROCm Libraries -->
+    <project groups="mathlibs" name="AMDMIGraphX" />
+    <project groups="mathlibs" name="MIOpen" />
+    <project groups="mathlibs" name="MIVisionX" />
+    <project groups="mathlibs" name="ROCmValidationSuite" />
+    <project groups="mathlibs" name="Tensile" />
+    <project groups="mathlibs" name="composable_kernel" />
+    <project groups="mathlibs" name="hipBLAS-common" />
+    <project groups="mathlibs" name="hipBLAS" />
+    <project groups="mathlibs" name="hipBLASLt" />
+    <project groups="mathlibs" name="hipCUB" />
+    <project groups="mathlibs" name="hipFFT" />
+    <project groups="mathlibs" name="hipRAND" />
+    <project groups="mathlibs" name="hipSOLVER" />
+    <project groups="mathlibs" name="hipSPARSE" />
+    <project groups="mathlibs" name="hipSPARSELt" />
+    <project groups="mathlibs" name="hipTensor" />
+    <project groups="mathlibs" name="hipfort" />
+    <project groups="mathlibs" name="rccl" />
+    <project groups="mathlibs" name="rocAL" />
+    <project groups="mathlibs" name="rocALUTION" />
+    <project groups="mathlibs" name="rocBLAS" />
+    <project groups="mathlibs" name="rocDecode" />
+    <project groups="mathlibs" name="rocJPEG" />
+    <project groups="mathlibs" name="rocPyDecode" />
+    <project groups="mathlibs" name="rocFFT" />
+    <project groups="mathlibs" name="rocPRIM" />
+    <project groups="mathlibs" name="rocRAND" />
+    <project groups="mathlibs" name="rocSHMEM" />
+    <project groups="mathlibs" name="rocSOLVER" />
+    <project groups="mathlibs" name="rocSPARSE" />
+    <project groups="mathlibs" name="rocThrust" />
+    <project groups="mathlibs" name="rocWMMA" />
+    <project groups="mathlibs" name="rocm-cmake" />
+    <project groups="mathlibs" name="rpp" />
+    <project groups="mathlibs" name="TransferBench" />
+<!-- Projects for OpenMP-Extras -->
+    <project name="aomp" path="openmp-extras/aomp" />
+    <project name="aomp-extras" path="openmp-extras/aomp-extras" />
+    <project name="flang" path="openmp-extras/flang" />
+</manifest>
Author	SHA1	Message	Date
Ibrahim Wani	b4ea72eb91	Add pip module installs to test job	2025-09-09 19:40:46 +00:00
Ibrahim Wani	5bc1197184	Quick fix	2025-09-09 19:29:12 +00:00
Ibrahim Wani	61114cd072	Remove nanobind install script	2025-09-09 19:27:41 +00:00
Ibrahim Wani	0b19fdf17d	Add nanobind to array correctly	2025-09-09 17:27:21 +00:00
Ibrahim Wani	e0a746d392	Add nanobind to array	2025-09-09 17:18:17 +00:00
Ibrahim Wani	61bf1adc4b	Clean up	2025-09-08 21:56:46 +00:00
Ibrahim Wani	b020b63cff	Fix hipblaslt ci errors	2025-09-08 20:47:25 +00:00
Ibrahim Wani	d9138bdec1	Add old changes back in	2025-09-08 20:12:31 +00:00
Ibrahim Wani	5ea4c4ddb9	Resolve more ci failures in hipblaslt	2025-09-08 19:51:12 +00:00
Ibrahim Wani	0569e6fb13	Fix hipblaslt build failures	2025-09-08 19:22:44 +00:00
Ibrahim Wani	6c2856013f	Merge remote-tracking branch 'origin/develop' into users/ibrahimw1/origami-azure-ci	2025-09-08 19:18:13 +00:00
Ibrahim Wani	54c92e1792	Run individual tests not ctest.	2025-09-08 19:05:53 +00:00
Ibrahim Wani	9e802ca390	Add missing cd	2025-09-08 18:03:59 +00:00
Ibrahim Wani	7a55ae8179	Only download artifacts for ubuntu	2025-09-08 17:52:17 +00:00
Ibrahim Wani	13994905a4	Try ctest instead of running test files directly	2025-09-08 17:48:36 +00:00
Ibrahim Wani	a8d1992c12	Download python bindings as artifact	2025-09-08 17:35:00 +00:00
Ibrahim Wani	36aacc7bec	Sparse checkout origami/python	2025-09-08 17:01:01 +00:00
Ibrahim Wani	a3b7bee019	Echo to list dir	2025-09-08 16:16:52 +00:00
Ibrahim Wani	6702d68dc6	Test1	2025-09-08 15:55:08 +00:00
Joseph Macaranas	94476f34ca	[External CI] Add amdgpu deps to rocpydecode pipeline (#5267 )	2025-09-08 11:32:10 -04:00
Ibrahim Wani	a81a33c5a9	Fix checkout in test job	2025-09-08 15:09:37 +00:00
Ibrahim Wani	b932004daf	Comment out block	2025-09-08 05:50:43 +00:00
Ibrahim Wani	df6ccad6cc	Download build as artifact.	2025-09-08 05:44:15 +00:00
Ibrahim Wani	65e2d8370e	Package bindings in build artifacts	2025-09-05 21:55:29 +00:00
Ibrahim Wani	51a77f7d60	Try removing job.target from params	2025-09-05 21:25:05 +00:00
Ibrahim Wani	eece16b75d	Build origami in test job	2025-09-05 21:09:45 +00:00
Ibrahim Wani	6bee1b86c7	Change test dir	2025-09-05 18:43:44 +00:00
Ibrahim Wani	99bdbd3d49	Test	2025-09-05 18:29:16 +00:00
Ibrahim Wani	1ed7f975cf	Clean up	2025-09-05 18:04:53 +00:00
Ibrahim Wani	663911c2db	Try to find build dir	2025-09-05 17:34:21 +00:00
Ibrahim Wani	bc703b3711	Find build dir	2025-09-05 17:21:05 +00:00
Ibrahim Wani	bddc9478ba	Fix sparse checkout	2025-09-05 17:10:29 +00:00
Ibrahim Wani	bd42ddd551	Finding test dir	2025-09-05 17:09:38 +00:00
Ibrahim Wani	b1fc9a18d9	Update test jobs	2025-09-05 16:54:20 +00:00
Ibrahim Wani	50590adf03	Add test job to origami	2025-09-05 16:43:30 +00:00
Peter Park	4bc1bf00c6	Update PyTorch training benchmark docker doc to 25.7 (#5255 ) * Update PyTorch training benchmark docker doc to 25.7 * update .wordlist.txt * update conf.py * update data sheet * fix sphinx warnings	2025-09-05 12:07:51 -04:00
Matt Williams	76fd6b2290	Updating broken link (#5258 )	2025-09-05 11:45:06 -04:00
Joseph Macaranas	e5345a9cca	External CI: rocdecode downstream builds (#5254 ) - Trigger downstream build of rocpydecode within rocdecode pipelines. - Copying similar variables as other pipelines even though these projects are not in the super-repos.	2025-09-05 10:12:39 -04:00
David Dixon	2f40189575	add catch2 (#5257 )	2025-09-04 18:48:34 -06:00
Ibrahim Wani	eceea00897	Resolve merge conflict; merge develop	2025-09-04 20:55:32 +00:00
Ibrahim Wani	af627119d1	test	2025-09-04 20:17:03 +00:00
Ibrahim Wani	58b117e998	Add more gpu targets	2025-09-04 20:08:27 +00:00
Ibrahim Wani	2551ed3092	Add gpu to pipeline	2025-09-04 19:31:17 +00:00
Ibrahim Wani	7ec397e214	Check devices on machine	2025-09-04 19:25:14 +00:00
Ibrahim Wani	27e0366b22	Adding job target to hipblaslt dependant builds	2025-09-04 16:17:42 +00:00
Ibrahim Wani	b5e52b4a7c	Test job target	2025-09-04 16:07:05 +00:00
David Dixon	9e1a82d327	Add libdivide (#5252 )	2025-09-03 20:11:38 -06:00
Ibrahim Wani	b059b8250e	Test	2025-09-03 21:52:35 +00:00
Ibrahim Wani	5a9e5decba	Fix CI error.	2025-09-03 21:18:20 +00:00
Ibrahim Wani	875756a718	Add archs to buildJobs	2025-09-03 21:05:02 +00:00
Joseph Macaranas	3aab9e1bc5	Modify sparseCheckoutDirectories in checkout.yml (#5251 ) Added 'shared' to sparseCheckoutDirectories parameter.	2025-09-03 16:58:17 -04:00
Ibrahim Wani	edf7dede1b	Add missing dep for CI	2025-09-03 20:48:05 +00:00
Ibrahim Wani	afe66159b2	Change build path for tests again	2025-09-03 20:14:52 +00:00
Ibrahim Wani	9ac6f5a1ba	Change build path for tests	2025-09-03 20:03:25 +00:00
David Dixon	2b0ce5e5c2	Fix typo (#5250 )	2025-09-03 13:59:41 -06:00
Ibrahim Wani	54943d086e	Run origami binding tests	2025-09-03 19:49:32 +00:00
David Dixon	f1be2d291a	Add fmtlib version that works with spdlog (#5249 )	2025-09-03 13:26:18 -06:00
amd-hsivasun	07cb61f969	Update testjob dependsOn	2025-09-03 14:02:47 -04:00
amd-hsivasun	c486c39b50	Update rocprofiler-compute.yml Reverted Component name and updated job names	2025-09-03 14:02:47 -04:00
amd-hsivasun	e68d9e9ce2	Update rocprofiler-compute.yml	2025-09-03 14:02:47 -04:00
amd-hsivasun	bff5c4a955	Fixed sparseCheckoutDir	2025-09-03 14:02:47 -04:00
amd-hsivasun	b0abc43c46	Added sparseCheckout to testjob template	2025-09-03 14:02:47 -04:00
amd-hsivasun	ceabccad83	Fixed componentName	2025-09-03 14:02:47 -04:00
amd-hsivasun	2628812fc4	[Ex CI] Enable rocprofiler-compute monorepo	2025-09-03 14:02:47 -04:00
amd-hsivasun	df3ea80290	Enable Roctracer Monorepo	2025-09-03 14:02:20 -04:00
David Dixon	b6647dfb22	Add spdlog source builds (#5247 )	2025-09-03 11:35:53 -06:00
David Dixon	c34fddb26a	Add boost deps (#5235 )	2025-09-02 13:28:19 -06:00
Daniel Su	977e9c2295	[Ex CI] change hip-clr pipeline ID (#5230 )	2025-08-27 13:06:08 -04:00
Daniel Su	eac9772fff	[Ex CI] add temporary downstream path from rocBLAS to hipBLAS (#5184 )	2025-08-27 13:05:51 -04:00
Daniel Su	151a4bd7bc	[Ex CI] add retries to potentially flaky steps (#5175 )	2025-08-27 13:05:26 -04:00
Daniel Su	9d28684161	[Ex CI] enable clr/hip/hipother monorepo builds (#5217 )	2025-08-27 10:43:07 -04:00
Braden Stefanuk	9ea9b33d14	[superbuild] Configure pipeline (#5221 )	2025-08-26 15:12:19 -06:00
Ibrahim Wani	b20384cf76	Testing nanobind install in pipelines	2025-08-26 20:00:19 +00:00
Ibrahim Wani	27207925f8	Quick fix	2025-08-26 19:55:24 +00:00
Ibrahim Wani	2ab2ccc98c	Replace pybind with nanobind.	2025-08-26 19:51:23 +00:00
Ibrahim Wani	d0825adab9	Merge remote-tracking branch 'origin/amd/danielsu/multi-sparse-checkout' into users/ibrahimw1/origami-azure-ci	2025-08-26 19:46:54 +00:00
Daniel Su	f920cf533d	hipBLASLt multi sparse	2025-08-26 11:26:14 -04:00
Daniel Su	6f02314b4b	[Ex CI] don't create symlink if more than one sparse checkout dir	2025-08-26 11:25:44 -04:00
Ibrahim Wani	2cfa3f48d4	Test	2025-08-25 22:06:46 +00:00
Ibrahim Wani	c9a825a9f0	Fix pybind11 dep for almalinux again	2025-08-25 21:45:12 +00:00
Ibrahim Wani	162c32a01a	Fix pybind11 dep for almalinux	2025-08-25 21:38:44 +00:00
Ibrahim Wani	e2f842d543	Quick fix	2025-08-25 20:06:06 +00:00
Ibrahim Wani	fb2eddc630	Fix pipeline failures.	2025-08-25 20:02:36 +00:00
Ibrahim Wani	2dc586060a	Add pybind dep	2025-08-25 19:02:02 +00:00
Ibrahim Wani	1f0e6580d6	Add cmake dependency step to origami yml.	2025-08-25 18:47:31 +00:00
Ibrahim Wani	d563b9b64b	Unindent lines.	2025-08-25 15:36:08 +00:00
Ibrahim Wani	eb8cd0d17d	Add origami yaml pipeline.	2025-08-22 22:34:53 +00:00
Matt Williams	1d42f7cc62	Deep learning frameworks edits for scale (#5189 ) * Deep learning frameworks edits for scale Based on https://ontrack-internal.amd.com/browse/ROCDOC-1809 * update table table * leo comments * formatting * format * update table based on feedback * header * Update machine learning page * headers * Apply suggestions from code review Co-authored-by: anisha-amd <anisha.sankar@amd.com> * Update .wordlist.txt * formatting * Update docs/how-to/deep-learning-rocm.rst Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com> --------- Co-authored-by: Matt Williams <Matt.Williams+amdeng@amd.com> Co-authored-by: anisha-amd <anisha.sankar@amd.com> Co-authored-by: Leo Paoletti <164940351+lpaoletti@users.noreply.github.com>	2025-08-22 11:46:07 -04:00
Peter Park	98029db4ee	docs: Add Primus (Megatron) training Docker documentation (#5218 )	2025-08-21 23:50:55 -04:00
Matt Williams	65ebbaa117	Merge pull request #5113 from ROCm/aqlprofile AQLProfile component additions	2025-08-21 12:53:16 -04:00
Joseph Macaranas	3dfc0cdbf1	[External CI] Update CMake on MIOpen build pipeline (#5210 )	2025-08-20 15:37:15 +00:00
Daniel Su	00b0d9430e	[Ex CI] change rocprofiler's branch to develop (#5208 )	2025-08-19 15:44:07 -04:00
Daniel Su	14acec6000	[Ex CI] switch rocprofiler pipeline ID (#5207 )	2025-08-19 15:22:02 -04:00
Peter Park	c154b7e0a3	Fix documented VRAM for Radeon AI Pro R9700 (#5203 )	2025-08-18 10:00:10 -04:00
David Dixon	9f5cd4500c	Don't use local tensilelite (#5201 )	2025-08-18 06:19:27 -06:00
Jan Stephan	51e7d9550f	Make documentation build platform-independent (#5052 ) Make documentation build platform-independent	2025-08-18 10:59:31 +02:00
Peter Park	55d0a88ec5	vLLM inference benchmark doc: add missing data field (#5199 )	2025-08-15 13:20:39 -04:00
Peter Park	7ee22790ce	docs: Update vLLM benchmark doc for 20250812 Docker release (#5196 )	2025-08-14 15:43:36 -04:00
Daniel Su	ec05312de7	[Ex CI] enable rocprofiler monorepo (#5197 ) * [Ex CI] enable rocprofiler monorepo * set ROCM_PATH	2025-08-14 14:31:34 -04:00
amd-hsivasun	39e7ccd3c5	Update variables-global.yml	2025-08-13 17:27:05 -04:00
dependabot[bot]	c4135ab541	Bump sphinx-sitemap from 2.7.2 to 2.8.0 in /docs/sphinx (#5192 ) Bumps [sphinx-sitemap](https://github.com/jdillard/sphinx-sitemap) from 2.7.2 to 2.8.0. - [Release notes](https://github.com/jdillard/sphinx-sitemap/releases) - [Changelog](https://github.com/jdillard/sphinx-sitemap/blob/master/CHANGELOG.rst) - [Commits](https://github.com/jdillard/sphinx-sitemap/compare/v2.7.2...v2.8.0) --- updated-dependencies: - dependency-name: sphinx-sitemap dependency-version: 2.8.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-08-13 09:22:31 -06:00
anisha-amd	dd56fd4d3a	develop: compatibility matrix frameworks support update (#5185 )	2025-08-12 14:25:37 -04:00
Peter Park	80f7dc79b9	Add Hunyuan Video to PyTorch inference benchmark models doc (#5094 )	2025-08-12 11:54:59 -04:00
David Dixon	231aa0bfc6	Merge pull request #5120 from ROCm/users/ellosel/hipblaslt-lapack-deps Add deps and config options for new hipblaslt build system	2025-08-11 13:32:09 -06:00
Joseph Macaranas	8655fb369a	[External CI] Full checkout of rocm-libraries for hipsparselt pipeline (#5178 )	2025-08-11 10:31:40 -04:00
Dominic Widdows	306b39ea5e	Merge pull request #5174 from ROCm/dwiddows-patch-1 Fix hyperlink syntax	2025-08-08 11:23:09 -07:00
Dominic Widdows	9e055d92ce	Fix hyperlink syntax	2025-08-08 10:28:09 -07:00
Daniel Su	85b13c0513	[Ex CI] temporarily disable high pool (#5173 )	2025-08-08 11:10:04 -04:00
pbhandar-amd	dba913095a	Merge pull request #5168 from ROCm/amd/pbhandar/manifest_700 Update XML for 6.4.3	2025-08-08 10:51:03 -04:00
Daniel Su	81b9d50c2c	[Ex CI] retry MIOpen CK download if unzip fails (#5163 )	2025-08-08 10:37:05 -04:00
David Dixon	e9bb2fca36	Remove build dir artifact creation	2025-08-08 14:26:12 +00:00
David Dixon	16e96caf80	Restore commented code	2025-08-08 14:26:12 +00:00
David Dixon	7e0efaa6b0	build all kernels	2025-08-08 14:25:43 +00:00
Daniel Su	af4f291005	Compress and upload build files	2025-08-08 14:25:43 +00:00
David Dixon	b9218832bc	Update hipBLASLt.yml	2025-08-08 14:25:43 +00:00
David Dixon	3f2c1d65eb	only run one test	2025-08-08 14:25:43 +00:00
David Dixon	ee4287fdd7	parallellize lapack build	2025-08-08 14:25:43 +00:00
David Dixon	d63db0be41	debug commit	2025-08-08 14:25:43 +00:00
David Dixon	6a37323fe7	Enable rocroller and use fetch content	2025-08-08 14:24:44 +00:00
David Dixon	b6b7b32e6d	Disable blis for new build system	2025-08-08 14:22:13 +00:00
David Dixon	7c11126938	Fix pip args	2025-08-08 14:22:13 +00:00
David Dixon	ac0b72497e	add python deps for hipblaslt	2025-08-08 14:22:13 +00:00
David Dixon	68bc7f83da	Need both target options while transitioning between build systems	2025-08-08 14:22:13 +00:00
David Dixon	5bbe8ecdcc	add deps install back	2025-08-08 14:22:13 +00:00
Daniel Su	6bc408d051	Change to GPU_TARGETS	2025-08-08 14:22:13 +00:00
Daniel Su	20762b9a96	Add blas and lapack to dnf map	2025-08-08 14:22:13 +00:00
David Dixon	fa5395a1a6	Drop lapack install script	2025-08-08 14:22:13 +00:00
Joseph Macaranas	254d863b91	External CI: Temporary Pipeline Change for CMake Refactor (#5166 ) - Disable gfx1030 builds temporarily for blas, sparse, and solvers. - TODO: gfx1030 build path should have separate build flags to use rocblas path.	2025-08-08 10:14:28 -04:00
Parag Bhandari	03bf20e614	Update XML for 6.4.3	2025-08-08 09:10:42 -04:00
Matt Williams	9786a75390	Update license	2025-07-31 10:33:36 -04:00
Matt Williams	95543cae2a	Final edits	2025-07-30 14:43:52 -04:00
Matt Williams	1cf3eef9da	AQLProfile component additions	2025-07-28 14:39:39 -04:00
Jan Stephan	3c71bb25e8	Make initial directory and copy operations platform-independent	2025-07-16 15:13:13 +02:00